Objective: Summary of the demographics and soil characteristics of the Rwanda long term soil health study.

Study methodology

Add in details and links on study methodology here.

library(knitr)
library(ggplot2)
library(stringr)
suppressMessages(library(dplyr))
library(sp)
suppressMessages(library(rgdal))
suppressMessages(library(dismo))
suppressMessages(library(stargazer))
library(leaflet)
library(XML)
suppressMessages(library(maptools))
library(automap)
suppressMessages(library(RStata))
suppressMessages(library(fields))
library(gstat)
library(htmltools)
suppressMessages(library(Matching))
library(reshape2)
options("RStata.StataVersion" = 12)
options("RStata.StataPath" = "/Applications/Stata/StataSE.app/Contents/MacOS/stata-se")
#chooseStataBin("/Applications/Stata/StataSE.app/Contents/MacOS/stata-se")
wd <- "/Users/mlowes/drive/optimized_agronomy/soil/soil_health_study/data/rw baseline"
dd <- paste(wd, "data", sep = "/")
od <- paste(wd, "output", sep="/")
md <- paste(wd, "maps", sep="/")
drive <- "~/drive/r_help/4_output/statistical_test_outputs"

#load data:
# This data is being drawn from the Soil lab repository. It has the baseline data with it
# d <- read.csv(paste(dd, "Rwanda_shs_commcare_soil_data_final.csv", sep="/"),  stringsAsFactors=FALSE)

Combine survey and soil data

Replicate Alex’s and Emmanuel’s merge process using “Identifiers with SSN_final” provided by Emmanuel. I use the RStata package found here

# I'm replicating Alex's do file here.
stata("merge_shs.do")
## . * merge raw commcare data with soil database
## . 
## . * date: 11 july 2016
## . 
## . clear all
## . set more off
## . 
## . * set directory
## . global wd "/Users/mlowes/drive/soil health study/data/rw baseline"
## . global dd "/Users/mlowes/drive/soil health study/data/rw baseline/data"
## . global troubleshoot "/Users/mlowes/drive/soil health study/data/rw baseline/t
## > roubleshoot"
## . 
## . * insheet data
## . insheet using "$dd/raw_rwanda_commcare_shs.csv", clear
## file /Users/mlowes/drive/soil health study/data/rw
##     baseline/data/raw_rwanda_commcare_shs.csv not found
## r(601);
## . 
## . * clean sample_id variable
## . replace sample_id = "" if sample_id == "---"
## . replace sample_id = "-99" if sample_id == "99"
## . 
## . destring sample_id, replace
## . 
## . * manipulate current "sample_id" to become a proper alpha-numeric unique iden
## > tifier
## . * this simply involves adding "C" to each numeric code that belongs to a cont
## > rol farmer
## . 
## . * variables of interest:
## .       * d_client
## .       * sample_id
## . 
## . ren demographic_infod_client d_client
## . replace d_client = "" if d_client == "---"
## . destring d_client, replace
## . 
## . tostring sample_id, gen(sample_id_string)
## . 
## . replace sample_id_string = sample_id_string + "C" if d_client == 0
## . drop sample_id 
## . ren sample_id_string sample_id
## . 
## . ***** !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ***
## > ***
## . * quantify and note cases where sample_id appears more than twice
## .      * 39 codes appear 2x
## .      * 1 code appears 3x
## .      * 1 code appears 4x
## .      * 1 code appears 10x
## .      * 1 code appears 19x
## . 
## . duplicates report sample_id
## . duplicates tag sample_id, gen(n_duplicate_id)
## . gen d_id_problem = 1 if n_duplicate_id != 0
## . replace d_id_problem = 0 if missing(d_id_problem)
## . 
## . ** need to investigate 5% of observations (114 samples), i.e. d_id_problem = 
## > 1**
## . ***** !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ***
## > ***
## . 
## . * outsheet problematic observations and examine 1-by-1
## . sort sample_id
## . outsheet demographicnom_enqueteur district demographic_infocellule_selected d
## > emographic_infoumudugudu d_client demographic_infonom_cultivateur sample_id d
## > emographic_infosex demographic_infoage_cultivateur demographic_infon_telephon
## > e_cult d_id_problem using "$dd/shs_to_check.csv" if d_id_problem == 1, replac
## > e comma
## . 
## . **************** post-investigation correction of incorrect ids *************
## > ***
## . 
## . ren demographic_infonom_cultivateur farmer_name
## . ren demographic_infocellule_selected selected_cell
## . ren demographicnom_enqueteur enumerator_name
## . 
## . * drop test/accidentally double-entered observations
## . drop if infoformid == "f138897f-6e80-4b0a-9db3-ab2134cd9e51"
## . drop if farmer_name == "Mugisha"
## . drop if farmer_name == "Muhunde yoramu"
## . drop if farmer_name == "Test"
## . drop if farmer_name == "Billy"
## . drop if farmer_name == "Gakuba michel"
## . drop if farmer_name == "Renata"
## . drop if farmer_name == "Bub"
## . drop if farmer_name == "M"
## . drop if farmer_name == "Anita"
## . drop if farmer_name == "Jean"
## . drop if farmer_name == "Nyirazaninka lea"
## . drop if farmer_name == "Mugiraneza Pacifiue"
## . drop if farmer_name == "Nsengiyaremye Innocent"
## . 
## . * correct incorrectly recorded ids
## . replace sample_id = "880C" if farmer_name == "Nyiransanzineza Marie Rose"
## . replace sample_id = "811C" if farmer_name == "Ngirabatware  Daniel"
## . replace sample_id = "646C" if farmer_name == "Nyirasabwa anonciata"
## . replace sample_id = "624C" if farmer_name == "Nyirinkindi j Bosco"
## . replace sample_id = "560C" if farmer_name == "Mubirigi Bertin"
## . replace d_client = 0 if farmer_name == "Mubirigi Bertin"
## . replace sample_id = "375C" if farmer_name == "Hitimana ezecquiere"
## . replace sample_id = "322C" if farmer_name == "Nyirantabire arrivera"
## . replace sample_id = "2566C" if farmer_name == "MUSABYIMANA Colletta"
## . replace sample_id = "2399C" if farmer_name == "Nyirabajyambere anastisia"
## . replace d_client = 0 if farmer_name == "Nyirabajyambere anastisia"
## . replace sample_id = "2280C" if farmer_name == "Zihabake Simon"
## . replace sample_id = "2202C" if farmer_name == "mugiraneza pacifique"
## . replace sample_id = "1780C" if farmer_name == "Hakuzimana Theophile"
## . replace sample_id = "1757C" if farmer_name == "Bavugirije Feleciane"
## . replace sample_id = "1626C" if farmer_name == "Nyabivumu"
## . replace sample_id = "1575C" if farmer_name == "BAHIMBANDE Beriyana."
## . replace d_client = 0 if farmer_name == "BAHIMBANDE Beriyana."
## . replace sample_id = "1103C" if farmer_name == "Barirwanda Elyse"
## . replace sample_id = "1037C" if farmer_name == "Mukashema Faraziya"
## . replace d_client = 0 if farmer_name == "Mukashema Faraziya" 
## . replace sample_id = "3069" if farmer_name == "Ndababonye Silas"
## . replace sample_id = "2611" if farmer_name == "Sebazungu modeste"
## . replace d_client = 1 if farmer_name == "Sebazungu modeste"
## . replace sample_id = "2415" if farmer_name == "Nkaka joseph"
## . replace sample_id = "2412" if farmer_name == "Ndagijimana alexis"
## . replace sample_id = "2410" if farmer_name == "Sebuhoro elie"
## . replace sample_id = "2406" if farmer_name == "Mugisha ruth"
## . replace sample_id = "2405" if farmer_name == "Mboneza hasan"
## . replace sample_id = "2404" if farmer_name == "Ntibitonderwa veriane"
## . replace sample_id = "2399" if farmer_name == "Nyiraburakeye feresita"
## . replace sample_id = "2395" if farmer_name == "Uwimana danier"
## . replace sample_id = "2394" if farmer_name == "Bizinde silas"
## . replace sample_id = "2390" if farmer_name == "Nyirahabimvana keresesiya"
## . replace sample_id = "2388" if farmer_name == "Nyirasekerabanzi tharcilla"
## . replace sample_id = "2386" if farmer_name == "Ndererimana emmanuel"
## . replace sample_id = "2291" if farmer_name == "Ndacyayisenga Feresiyani"
## . replace sample_id = "2184" if farmer_name == "Muhawenimana Beretirida"
## . replace sample_id = "2145" if farmer_name == "Fapfakwita celestine"
## . replace sample_id = "2141" if farmer_name == "Nsangiranabo krizoroji"
## . replace sample_id = "2140" if farmer_name == "Nsabyumuremyi anakereti"
## . replace sample_id = "2136" if farmer_name == "Nyirantibitonderwa savoroniya"
## . replace sample_id = "2131" if farmer_name == "Hagenimana Ewufroni"
## . replace sample_id = "2066" if farmer_name == "Hitayezu vicent"
## . replace d_client = 1 if farmer_name == "Hitayezu vicent"
## . replace sample_id = "2044" if farmer_name == "Niyodushima janette"
## . replace d_client = 1 if farmer_name == "Niyodushima janette"
## . replace sample_id = "2024" if farmer_name == "Mukabatsinda veren A"
## . replace sample_id = "2020" if farmer_name == "Sibomana innocent"
## . replace sample_id = "2009" if farmer_name == "Nsabimana inyasi"
## . replace sample_id = "2002" if farmer_name == "Mateso elizabet"
## . replace sample_id = "1889" if farmer_name == "Niyonsaba Daniel"
## . replace sample_id = "1864" if farmer_name == "Mukankubana Souzanne"
## . replace sample_id = "1780" if farmer_name == "Nyirantihabose Annonciatha"
## . replace sample_id = "1771" if farmer_name == "Bizimungu Damascne"
## . replace sample_id = "1675" if farmer_name == "Bikorimana Isaac"
## . replace sample_id = "1103" if farmer_name == "Munyabarata japhette"
## . replace sample_id = "954" if farmer_name == "Hahirwabake silver"
## . replace sample_id = "541" if farmer_name == "Rugemintwaza Vianney"
## . replace sample_id = "454" if farmer_name == "Niyonizeye fororida"
## . replace sample_id = "407" if farmer_name == "Munyaneza jean pierre"
## . replace sample_id = "375" if farmer_name == "Munyarubuga nanias"
## . replace sample_id = "322" if farmer_name == "Gacandaga zackarie"
## . replace sample_id = "262" if farmer_name == "MUHIRWA J Damascene"
## . replace sample_id = "32" if farmer_name == "Nsanzemuhire"
## . 
## . *** re-run duplicates test
## . drop d_id_problem
## . drop n_duplicate_id
## . 
## . duplicates report sample_id
## . duplicates tag sample_id, gen(n_duplicate_id)
## . gen d_id_problem = 1 if n_duplicate_id != 0
## . replace d_id_problem = 0 if missing(d_id_problem)
## . 
## . * second try: outsheet problematic observations and examine 1-by-1
## . sort sample_id
## . outsheet using "$dd/shs_to_check_2.csv" if d_id_problem == 1, replace comma
## . 
## . * second try: correct incorrect ids
## . 
## . replace sample_id = "2202" if farmer_name == "Mukamurara scholastique"
## . replace d_client = 1 if farmer_name == "Mukamurara scholastique"
## . replace sample_id = "2195" if farmer_name == "Ndacyayisenga Feresiyani"
## . replace sample_id = "824" if farmer_name == "Ngirabatware  Daniel"
## . 
## . replace sample_id = "" if rownumber == 238
## . replace sample_id = "" if rownumber == 848
## . replace sample_id = "" if rownumber == 1257
## . replace sample_id = "" if rownumber == 1732
## . 
## . *** re-run duplicates test
## . drop d_id_problem
## . drop n_duplicate_id
## . 
## . duplicates report sample_id
## . duplicates tag sample_id, gen(n_duplicate_id)
## . gen d_id_problem = 1 if n_duplicate_id != 0
## . replace d_id_problem = 0 if missing(d_id_problem)
## . 
## . * third try: outsheet problematic observations
## . sort sample_id
## . outsheet using "$dd/shs_to_check_3.csv" if d_id_problem == 1, replace comma
## . 
## . *** 4 sets of duplicates remain (2 observations)
## . drop if d_id_problem == 1
## . save "$dd/rwanda_commcare_shs_clean.dta", replace
## . outsheet using "$dd/rwanda_commcare_shs_clean.csv", comma replace
## . 
## . * merge datasets
## . insheet using "$dd/mne_rwanda_database_shs.csv", clear
## . 
## . save "$dd/mne_rwanda_database_shs.dta", replace
## . 
## . use "$dd/rwanda_commcare_shs_clean.dta", clear
## . 
## . merge 1:1 sample_id using "$dd/mne_rwanda_database_shs.dta"
## . 
## .    * 93 observations unmatched... :(
## . 
## . * investigate unmatched observations
## . sort sample_id
## . outsheet using "$troubleshoot/unmatched_commcare.csv" if _merge == 1, comma r
## > eplace
## . outsheet using "$troubleshoot/unmatched_inventory.csv" if _merge == 2, comma 
## > replace
## . 
## . * make first round of corrections in master data
## . use "$dd/rwanda_commcare_shs_clean.dta", clear
## . 
## . replace sample_id = "2711C" if farmer_name == "Kamuhanda Elie"
## . replace sample_id = "260C" if farmer_name == "HATEGEKIMANA Mathias"
## . replace sample_id = "245" if farmer_name == "Kayiranga Michel"
## . replace sample_id = "1543" if farmer_name == "Ndagijimana Eliezel"
## . replace sample_id = "1543C" if farmer_name == "Mukantanganzwa odeta"
## . replace sample_id = "1632" if farmer_name == "Nyiranziguye Cecile"
## . replace sample_id = "2782" if farmer_name == "Mukamungu leocadie"
## . replace sample_id = "421C" if farmer_name == "Niyonizeye Francine"
## . replace sample_id = "828C" if farmer_name == "Mukamazimpaka Terese"
## . replace sample_id = "364" if farmer_name == "Mukamuyango verina"
## . replace sample_id = "364C" if farmer_name == "Mujawimana jeanne"
## . replace sample_id = "368" if farmer_name == "Nyiramanzi jean damascene"
## . replace sample_id = "368C" if farmer_name == "Karwiyegura rachel"
## . replace sample_id = "391" if farmer_name == "Mugemane augistin"
## . replace sample_id = "391C" if farmer_name == "Ngirishuti sumayire"
## . replace sample_id = "395" if farmer_name == "Niyomugabo amiel"
## . replace sample_id = "395C" if farmer_name == "Ntawiniga augustin"
## . replace sample_id = "396" if farmer_name == "Mpayimana philippe"
## . replace sample_id = "329" if farmer_name == "Mucyezangango emmanuel"
## . replace sample_id = "411" if farmer_name == "Nsabimana mathias"
## . replace sample_id = "411C" if farmer_name == "Singirankabo"
## . replace sample_id = "414" if farmer_name == "Rutaharama eldephonse"
## . replace sample_id = "329C" if farmer_name == "Mukamana josephine"
## . replace sample_id = "657" if farmer_name == "Nyandwi vincent"
## . replace sample_id = "569C" if farmer_name == "Uwimana Bercilla"
## . replace sample_id = "2051C" if farmer_name == "Ntambabazi Anastase"
## . replace sample_id = "2249" if farmer_name == "Uwineza valentine"
## . replace sample_id = "2360C" if farmer_name == "Bamporineza j deDieu"
## . replace sample_id = "2001" if farmer_name == "Nyiramakuba thanene"
## . replace sample_id = "2897C" if farmer_name == "Mukagatanazi Marianne"
## . replace sample_id = "2965C" if farmer_name == "Twizeyimana Theoneste"
## . 
## . drop if farmer_name == "Yjk"
## . drop if farmer_name == "Uwambajimana Agnes"
## . drop if farmer_name == "Tr"
## . 
## . replace sample_id = "2415C" if farmer_name == "Nyiramahirwe Frolance"
## . replace sample_id = "851" if farmer_name == "Bigirimana  Amon"
## . 
## . 
## . * duplicates check
## . drop d_id_problem
## . drop n_duplicate_id
## . 
## . duplicates report sample_id
## . duplicates tag sample_id, gen(n_duplicate_id)
## . gen d_id_problem = 1 if n_duplicate_id != 0
## . replace d_id_problem = 0 if missing(d_id_problem)
## . 
## . * second try: merge
## . merge 1:1 sample_id using "$dd/mne_rwanda_database_shs.dta"
## . 
## . * deal with 30 unmatched cases
## . sort sample_id
## . outsheet using "$troubleshoot/unmatched_commcare_2.csv" if _merge == 1, comma
## >  replace
## . outsheet using "$troubleshoot/unmatched_inventory_2.csv" if _merge == 2, comm
## > a replace
## . 
## . * 5 submitted surveys with no corresponding sample
## . *drop if _merge == 1
## . 
## . * 19 samples with no corresponding survey
## . *drop if _merge == 2
## . 
## . * outsheet merged database
## . outsheet using "$dd/rwanda_shs_baseline_data.csv", comma replace
## . * THIS IS THE CLEAN VERSION! ONLY USE THIS! DON'T USE OTHER THINGS!
## .

Import the results of Alex’s do file.

d <- read.csv(paste(dd, "rwanda_shs_baseline_data.csv", sep = "/"), stringsAsFactors = F)
Identifiers <- read.csv(paste(dd, "Identifiers with SSN_final.csv", sep = "/"), stringsAsFactors = F)

wetChem <- read.csv(paste("/Users/mlowes/drive/JuPyteR/robert.on@oneacrefund.org/Rwanda Acidity", "Original chem_rwanda_shs.csv", sep = "/"))


d <- left_join(d, Identifiers, by="sample_id")

# now import the soil results and merge with survey data
soilRF <- read.csv(paste(dd, "rwshs_rfresults.csv", sep = "/"), stringsAsFactors = F)
names(soilRF)[names(soilRF)=="X"] <- "SSN"

d <- left_join(d, soilRF, by="SSN")

Cleaning baseline variables

Now let’s start cleaning the demographic variables

d[d=="---"] <- NA

names(d) <- gsub("text_final_questions", "", names(d))
names(d) <- gsub("intro_champ_echantillon", "", names(d))
names(d) <- gsub("demographic_info", "", names(d))
names(d) <- gsub("other_inputs_", "", names(d))
names(d) <- gsub("crop1_15b_inputs", "", names(d))
names(d) <- gsub("crop2_15b_inputs", "", names(d))
names(d) <- gsub("^15b", "", names(d))
names(d) <- gsub("historical_intro", "", names(d))

names(d)[names(d)=="field_dim"] <- "field_dim1"
names(d)[names(d)=="v51"] <- "field_dim2"

Address unusual field sizes

ggplot(d, aes(x=field_dim1, y=field_dim2)) + geom_point()
## Warning: Removed 19 rows containing missing values (geom_point).

It seems really unlikely that fields are 200 meters long while only being 20 meters wide. I don’t know how to check this though.

# clean field dimensions here - winsor the values to something reasonable.
d[(d$field_dim1>=100 | d$field_dim2>=100) & !is.na(d$field_dim1), c("field_dim1", "field_dim2")]
##      field_dim1 field_dim2
## 125          85        125
## 126         200         10
## 127          20        152
## 130         140        250
## 140          50        135
## 211          12        107
## 415           5        135
## 452         108         65
## 626         100         40
## 836          12        153
## 844          10        212
## 1153        150         20
## 1154        150        160
## 1155        100         40
## 1239        102         57
## 1241        112         13
## 1314        140          8
## 1345        104         24
## 1347        123          6
## 1378        120          9
## 1388        100          7
## 1389        200          4
## 1397        160          4
## 1399        150         50
## 1420        200         20
## 1421        200         18
## 1440         30        105
## 1457         29        101
## 1476        102         73
## 1477        108         62
## 1525         12        123
## 1539        100         95
## 1567        100          8
## 1709        100          6
## 1859        157          9
## 1976         15        120
## 1979         25        102
## 2074          7        100
## 2411         30        153
## 2452        203          8

Take care of demographic data formatting issues

# deal with names and drop unnecessary variables
d <- d %>% 
    dplyr::select(-c(rownumber, infoformid, introductiond_accept, photo,
        infocompleted_time, 
        enumerator_name, contains("phone"), farmer_name, farmersurname, farmername,
        d_respondent, additionalsamplepackedandsenttol, additionalsamplerequestedfromlab,
        datedryingcompleteifnecessary, driedindistrictifnecessary, senttohqyo,
        collectedindistrictyo, excessstoredathq_, receivedathq_,dateofinitialdryingifnecessary,
        samplecollectedinfieldyo, field_des, samplewetordry)) %>%
    rename(
    female = sex,
    age = age_cultivateur,
    own = d_own,
    client = d_client) %>%
    mutate(
    female= ifelse(female=="gore", 1,0),
    field.size = field_dim1*field_dim2
    )

d$total.seasons <- apply(d[, grep("d_season_list", names(d))], 1, function(x) {
    sum(x, na.rm=T)})

Clean agronomic practice variables

agVars <- c("n_season_fert", "n_season_compost", "n_season_lime", "n_season_fallow",
            "n_seasons_leg_1", "n_seasons_leg_2")
summary(d[,agVars])
##  n_season_fert    n_season_compost n_season_lime     n_season_fallow  
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.0000   Min.   : 0.0000  
##  1st Qu.: 0.000   1st Qu.: 2.000   1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Median : 1.000   Median : 5.000   Median : 0.0000   Median : 0.0000  
##  Mean   : 2.041   Mean   : 5.651   Mean   : 0.1819   Mean   : 0.6355  
##  3rd Qu.: 3.000   3rd Qu.:10.000   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.0000   Max.   :10.0000  
##  NA's   :19       NA's   :19       NA's   :19        NA's   :19       
##  n_seasons_leg_1  n_seasons_leg_2   
##  Min.   : 0.000   Min.   : -88.000  
##  1st Qu.: 0.000   1st Qu.:   0.000  
##  Median : 1.000   Median :   2.000  
##  Mean   : 2.171   Mean   :   4.368  
##  3rd Qu.: 4.000   3rd Qu.:   5.000  
##  Max.   :10.000   Max.   :3333.000  
##  NA's   :19       NA's   :19

Sort out the legumes as a second crop

table(d$n_seasons_leg_2, useNA = 'ifany')
## 
##  -88    0    1    2    3    4    5    6    7    8    9   10   15   22   50 
##    1  956  221  258  174  130  301   61   34   66   57  199    1    1    1 
##   53   83   88  101  102 3333 <NA> 
##    3    1    1    1    1    1   19
d$n_seasons_leg_2 <- ifelse(d$n_seasons_leg_2 <0 | d$n_seasons_leg_2>10, NA, d$n_seasons_leg_2)

Check out client variable - why are there missing values?

table(d$client, useNA = 'ifany')
## 
##    0    1 <NA> 
## 1233 1236   19
d[is.na(d$client), c("sample_id")]
##  [1] "1189C" "1207C" "1295"  "1295C" "1298C" "1300"  "1300C" "1366C"
##  [9] "1476C" "1485C" "1554"  "1573"  "1614"  "2042"  "2371C" "2373" 
## [17] "2375"  "2382"  "2442"
# replace client based on whether there is a C in the client variable.
d$client.check <- ifelse(grepl("C", d$sample_id)==T, 0, 1)
table(d$client, d$client.check)
##    
##        0    1
##   0 1233    0
##   1    1 1235

It looks like most farmers were recorded correctly except for one farmer who was coded as Tubura farmer but their sample_id indicate their a control. Let’s take a look:

d[d$client==1 & d$client.check==0 & !is.na(d$d_gps), c("sample_id", "tuburacontroltc")]
##     sample_id tuburacontroltc
## 990     2202C         Control
# they should be a control
d[d$client==1 & d$client.check==0 & !is.na(d$d_gps), "client"] <- 0

# remove farmers for which we have soil data but no survey data (using)
d <- d[-which(grepl("using", d$X_merge)),]
table(d$client, d$client.check, useNA = 'ifany')
##    
##        0    1
##   0 1234    0
##   1    0 1235
# update client to equal client check
d$client <- d$client.check

Fix some more variable names:

names(d)[names(d)=="field_kg_fert1_1"] <- "field_kg_fert1_15b"
names(d)[names(d)=="field_kg_fert2_1"] <- "field_kg_fert2_15b"
names(d)[names(d)=="field_kg_compost"] <- "field_kg_compost_15b"

Recode variables to numeric:

# recode to numeric
varlist <- c("client", "own", "crop1_15b_seedkg", "crop1_15b_yield", "crop1_15b_yield_",
    "crop2_15b_seedkg", "crop2_15b_yield", "crop2_15b_yield_", "field_kg_fert1_15b",
    "field_kg_fert2_15b", "field_kg_compost_15b", "d_lime_15b", "kg_lime_15b")

# check that there aren't values hidden in the character variables
#apply(d[,varlist], 2, function(x){table(x, useNA='ifany')})

# recode characters to numerics
d[, varlist] <- sapply(d[,varlist], as.numeric)

Sort out kgs of lime applied

table(d$kg_lime_15b, useNA = 'ifany')
## 
##  -88    1    3    4    5   10   13   15   20   25   30   35   50   60   88 
##    1    2    2    2    6    5    1    1    2    5    1    1    8    1    2 
##  100  150  200 <NA> 
##    2    4    1 2422
d$kg_lime_15b <- ifelse(abs(d$kg_lime_15b)==88, NA, d$kg_lime_15b)
# divide out GPS coordinates
# http://rfunction.com/archives/1499

# replace the blank gps_pic_guide with info
d <- cbind(d, str_split_fixed(d$gps_pic_guid, " ", n=4))
names(d)[names(d)=="1" |names(d)== "2" | names(d)== "3" | names(d)== "4"] <- c("lat", "lon", "alt", "precision")
d[,c("lat", "lon", "alt", "precision")] <- sapply(d[,c("lat", "lon", "alt", "precision")],
                                                  function(x){as.numeric(as.character(x))})

Cleaning soil data

Cleaning of soil data: Come back, check and clean the soil data before outputting to clean data set. Plot each of the soil variables to look for unrealistic values.

dim(d[is.na(d$m3.Ca),])
## [1]  26 111
d <- d[-which(is.na(d$m3.Ca)),]
       
summary(d[,c("m3.Ca", "m3.Mg", "pH", "Total.N", "Total.C", "ExAl")])
##      m3.Ca            m3.Mg               pH           Total.N       
##  Min.   : 213.8   Min.   :  36.93   Min.   :4.549   Min.   :0.05723  
##  1st Qu.: 436.5   1st Qu.: 115.94   1st Qu.:5.020   1st Qu.:0.13577  
##  Median : 723.1   Median : 184.63   Median :5.535   Median :0.15483  
##  Mean   : 877.1   Mean   : 208.23   Mean   :5.543   Mean   :0.15706  
##  3rd Qu.:1159.5   3rd Qu.: 270.55   3rd Qu.:6.010   3rd Qu.:0.17910  
##  Max.   :3724.8   Max.   :1009.88   Max.   :7.131   Max.   :0.24656  
##     Total.C            ExAl        
##  Min.   :0.9107   Min.   :0.02589  
##  1st Qu.:1.7298   1st Qu.:0.09995  
##  Median :2.1164   Median :0.23170  
##  Mean   :2.1147   Mean   :0.57795  
##  3rd Qu.:2.3664   3rd Qu.:1.00947  
##  Max.   :4.2212   Max.   :2.31905

Let’s check out the rows for which we don’t have soil data and drop them as they won’t contribute to the full picture.

Graphs of RW baseline soil variables

soilVars <- c("m3.Ca", "m3.Mg", "pH", "Total.N", "Total.C", "ExAl")
for(i in 1:length(soilVars)){
print(
  ggplot(data=d, aes(x=as.factor(client), y=d[,soilVars[i]])) + 
    geom_boxplot() +
    labs(x="Tubura Farmer", y=soilVars[i], title = paste("RW baseline soil - ", soilVars[i], sep = ""))
  )  
}

Check out soil relationships

There are biologically predictable relationships between soil chemical characteristics. For instance, we expect Ca and Mg to move in the same direction and be positively correlated with pH. If we had Aluminum as an outcome, we’d expect pH to be negatively correlated with soluable aluminum. Let’s look quickly to confirm if those relationships are present:

ggplot(d, aes(x=m3.Ca, y=m3.Mg)) + geom_point() +
    stat_smooth(method="loess") +
    labs(x = "Calcium (m3)", y= "Magnesium (m3)", title="Calcium and Magnesium relationship")

ggplot(d, aes(x=pH, y=m3.Ca)) + geom_point() +
  stat_smooth(method="loess") +
  labs(x = "pH", y="Calcium (m3)", title = "pH and Calcium relationship")

ggplot(d, aes(x=pH, y=m3.Mg)) + geom_point() +
  stat_smooth(method="loess") +
  labs(x = "pH", y="Magnesium (m3)", title = "pH and Magnesium relationship")

ggplot(d, aes(x=pH, y=ExAl)) + geom_point() +
  stat_smooth(method="loess") +
  labs(x = "pH", y="Exchangeable Aluminum", title = "pH and Aluminum relationship")

ggplot(d, aes(x=Total.C, y=Total.N)) + geom_point() + 
  stat_smooth(method="loess") +
  labs(x = "Total Carbon", y="Total Nitrogen", title = "Carbon and Nitrogen relationship")

The soil characteristics are moving in the manner that is consistent with our understanding of soil chemical processes.

Save clean demographic and soil data to external file

write.csv(d, file=paste(dd, "shs rw baseline.csv", sep = "/"))
save(d, file=paste(dd, "shs rw baseline.Rdata", sep = "/"))

Map of baseline observations

Produce a simple map of where our observations are

See here for more on using markerClusterOptions in leaflet.

In the map below, the larger green circles are Tubura farmers and the smaller blue circles are control farmers.

e <- d[!is.na(d$lon),]
ss <- SpatialPointsDataFrame(coords = e[, c("lon", "lat")], data=e)

pal <- colorNumeric(c("navy", "green"), domain=unique(ss$client))
map <- leaflet() %>% addTiles() %>%
  setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
  addCircleMarkers(lng=ss$lon, lat=ss$lat, 
                   radius= ifelse(ss$client==1, 10,6),
                   color = pal(ss$client),
clusterOptions = markerClusterOptions(disableClusteringAtZoom=13, spiderfyOnMaxZoom=FALSE))

map

Summary statistics

Table of final baseline breakdown

count <- d %>% group_by(district) %>% 
  dplyr::summarize(
    t.count = sum(ifelse(client==1,1,0)),
    c.count = sum(ifelse(client==0,1,0)),
    total = n()
  ) %>% ungroup()

count <- as.data.frame(count)
write.csv(count, file=paste(od, "final rw sample breakdown.csv", sep="/"), row.names=F)
as.data.frame(count)
##        district t.count c.count total
## 1   Gatsibo_LWH      58      59   117
## 2  Gatsibo_NLWH      87      81   168
## 3      Gisagara      46      46    92
## 4          Huye     114     115   229
## 5       Karongi     163     159   322
## 6       Kayonza      55      58   113
## 7      Mugonero     131     129   260
## 8     Nyamagabe      56      56   112
## 9    Nyamasheke     147     146   293
## 10       Nyanza      46      46    92
## 11    Nyaruguru      46      46    92
## 12      Rutsiro     165     167   332
## 13    Rwamagana     110     111   221

Baseline balance

Let’s see how balanced our farmers are in terms of demographic variables. Tubura farmers were selected based on (list criteria) and control farmers in the same area tha fit the same criteria were also selected. No matching process has been performed to identify the control farmers that most closely resemble the Tubura farmers in the sample. These results are entirely reflecting the balance inherent in the identification process, not any statistical matching of treatment and control.

d$valley <- ifelse(d$general_field_infofield_location=="valley", 1,0)
d$hillside <- ifelse(d$general_field_infofield_location=="hillside", 1,0)
d$hilltop <- ifelse(d$general_field_infofield_location=="hilltop", 1,0)
names(d)[names(d)=="general_field_infograde_hill"] <- "slope"
cor(d[, grep("betail_", names(d))], use='complete.obs')
##                        betail_ownedn_inka betail_ownedn_ihene
## betail_ownedn_inka             1.00000000         0.036402152
## betail_ownedn_ihene            0.03640215         1.000000000
## betail_ownedn_inkoko           0.09740274         0.127424693
## betail_ownedn_ingurube         0.02166420        -0.009667001
## betail_ownedn_intama           0.04245052         0.002625236
##                        betail_ownedn_inkoko betail_ownedn_ingurube
## betail_ownedn_inka               0.09740274            0.021664199
## betail_ownedn_ihene              0.12742469           -0.009667001
## betail_ownedn_inkoko             1.00000000            0.049416370
## betail_ownedn_ingurube           0.04941637            1.000000000
## betail_ownedn_intama             0.03041599            0.025487680
##                        betail_ownedn_intama
## betail_ownedn_inka              0.042450521
## betail_ownedn_ihene             0.002625236
## betail_ownedn_inkoko            0.030415986
## betail_ownedn_ingurube          0.025487680
## betail_ownedn_intama            1.000000000
names(d)[grep("betail_", names(d))] <- c("cows", "goats", "chickens", "pigs", "sheep")

d$own.cows <- ifelse(d$cows>0 & !is.na(d$cows), 1,0)
d$own.goats <- ifelse(d$goats>0 & !is.na(d$goats), 1,0)
d$own.chickens <- ifelse(d$chickens>0 & !is.na(d$chickens), 1,0)
d$own.pigs <- ifelse(d$pigs>0 & !is.na(d$pigs), 1,0)
d$own.sheep <- ifelse(d$sheep>0 & !is.na(d$sheep), 1,0)

Crop selection in 15B

names(d)[names(d)=="inputuse_15b_priorculture_15b_1"] <-"crop_15b"
d$crop_15b <- tolower(d$crop_15b)
sort(prop.table(table(d$crop_15b, useNA = 'ifany')))
## 
##          kor         hwag        sheke       coffee          cer 
## 0.0004093328 0.0008186656 0.0008186656 0.0012279984 0.0024559967 
##        uburo         teke       insina         shaz          nyo 
## 0.0036839951 0.0040933279 0.0057306590 0.0085959885 0.0135079820 
##          gan    other_veg          ray          yum       ntacyo 
## 0.0196479738 0.0212853050 0.0519852640 0.0577159230 0.0589439214 
##          gor         soya          jum          big         saka 
## 0.0597625870 0.0790012280 0.0920998772 0.1604584527 0.1649611134 
##          shy 
## 0.1927957429
d$climbing_bean <- ifelse(d$crop_15b=="shy", 1,0)
d$sorghum <- ifelse(d$crop_15b=="saka", 1,0)
d$bush_bean <- ifelse(d$crop_15b=="big", 1,0)
d$maize <- ifelse(d$crop_15b=="gor", 1,0)
out.list <- c("female", "age", "hhsize", "own", "field.size",
              "n_season_fert", "n_season_compost", "n_season_lime", "n_season_fallow", "n_seasons_leg_1", "n_seasons_leg_2", "slope",  "alt", "valley", "hillside", "hilltop", "cows", "goats", "chickens", "pigs", "sheep", "climbing_bean", "sorghum", "bush_bean","maize", soilVars, "own.cows", "own.goats", "own.chickens", "own.pigs", "own.sheep")

output <- do.call(rbind, lapply(out.list, function(x) {
  
  out <- t.test(d[,x] ~ d[,"client"], data=d)
  tab <- data.frame(out[[5]][[2]],out[[5]][[1]], out[3])
  tab[,1:2] <- round(tab[,1:2],3)
  names(tab) <- c(names(out[[5]]), "pvalue")
  return(tab)
}))

# use p.adjust with bonferroni correction
output$pvalue <- p.adjust(output$pvalue, method="fdr")
rownames(output) <- out.list
output <- output[order(output$pvalue),]
output$pvalue <- ifelse(output[, 3] < 0.001, "< 0.001", round(output[, 3], 3)) 


colnames(output) <- c("Tubura Client","Non-Tubura", "p-value")  
print(kable(output))
Tubura Client Non-Tubura p-value
n_season_fert 3.263 0.833 < 0.001
female 0.493 0.649 < 0.001
own.cows 0.683 0.539 < 0.001
age 44.021 48.088 < 0.001
hhsize 5.418 4.894 < 0.001
own.chickens 0.342 0.267 < 0.001
n_seasons_leg_2 2.550 3.072 < 0.001
n_season_lime 0.228 0.132 < 0.001
own.pigs 0.355 0.281 < 0.001
pigs 0.525 0.394 0.006
chickens 1.225 0.895 0.006
hillside 0.682 0.733 0.016
own 0.932 0.957 0.023
cows 1.178 0.912 0.023
valley 0.266 0.222 0.025
maize 0.072 0.048 0.025
sorghum 0.146 0.184 0.026
climbing_bean 0.212 0.174 0.036
own.sheep 0.104 0.077 0.041
sheep 0.202 0.142 0.049
n_season_compost 5.818 5.523 0.095
Total.N 0.156 0.158 0.102
n_season_fallow 0.690 0.569 0.103
m3.Mg 204.339 212.129 0.171
m3.Ca 859.917 894.354 0.181
field.size 667.734 601.671 0.214
Total.C 2.104 2.126 0.393
bush_bean 0.154 0.167 0.456
n_seasons_leg_1 2.228 2.135 0.472
hilltop 0.051 0.044 0.472
pH 5.533 5.553 0.472
slope 10.203 10.445 0.547
ExAl 0.584 0.571 0.654
goats 1.076 1.053 0.768
alt 1570.267 1565.917 0.863
own.goats 0.485 0.485 0.981
#write table
write.csv(output, file=paste(od, "baseline balance.csv", sep="/"), row.names=T)
# write baseline balance table for ES per his table layouts
t4order <- c("age", "female", "hhsize", "own")
table4vars <- paste(t4order, collapse="|")

rw.table4 <- output[grepl(table4vars, rownames(output)),]
rw.table4 <- rw.table4[order(match(rownames(rw.table4), t4order)),]
write.csv(rw.table4, file=paste(od, "pre match balance table4.csv", sep="/"),
          row.names = T)

t5order <- c("field.size", "slope", "alt", "hilltop", "hillside", "valley",
                      "climbing_bean", "sorghum", "bush_bean", "maize")

table5vars <- paste(t5order,collapse="|")

rw.table5 <- output[grepl(table5vars, rownames(output)),]
rw.table5 <- rw.table5[order(match(rownames(rw.table5), t5order)),]
write.csv(rw.table5, file=paste(od, "pre match balance table5.csv", sep = "/"),
          row.names = T)

t6order <- c("own.cows", "cows", "own.pigs", "pigs", "own.sheep", "sheep", "own.goats", "goats","own.chickens", "chickens")
table6vars <- paste(t6order, collapse = "|")
rw.table6 <- output[grep(table6vars, rownames(output)), ]
rw.table6 <- rw.table6[order(match(rownames(rw.table6), t6order)),]
write.csv(rw.table6, file=paste(od, "pre match balance table6.csv", sep = "/"), 
          row.names=T)

Overall balance interpretation

Demographic variables We are not well balanced along the main demographic variables we collected, sex, age and HH size. For the purposes of inference we can test some matching algorithms to improve the match between Tubura and control farmers.

Agricultural practice variables We are decently balanced along agricultural practice variables. Our course of action here is similiar to our options with the demographic variables.

Soil Variables We are balanced on the primary soil variables of interest betwen our Tubura farmers and the comparison farmers.

Baseline balance by district

dist.output <- do.call(rbind, lapply(split(d, d$district), function(x) {
  
  tab <- do.call(rbind, lapply(out.list, function(y) {
    
    out <- t.test(x[,y] ~ x[,"client"], data=x)
    tab <- data.frame(out[[5]][[2]],out[[5]][[1]], out[3])
    tab[,1:2] <- round(tab[,1:2],3)
    names(tab) <- c(names(out[[5]]), "pvalue")
    #tab[,3] <- p.adjust(tab[,3], method="holm")
    #tab[,3] <- ifelse(tab[,3] < 0.001, "< 0.001", round(tab[,3],3))
    #print(tab)
    return(tab)
  }))
  
  return(data.frame(district = unique(x$district), tab))
}))

rownames(dist.output) <- NULL
dist.output$variable <- rep(out.list,length(unique(d$district)))    

# order variables 
dist.output <- dist.output[, c(1, 5, 2:4)]
dist.output$pvalue <- p.adjust(dist.output$pvalue, method="fdr")
dist.output <- dist.output[order(dist.output$pvalue),]

dist.output$pvalue <- ifelse(dist.output$pvalue < 0.001, "< 0.001", round(dist.output$pvalue,3))
colnames(dist.output) <- c("District", "Varible", "Tubura Client","Non-Tubura", "p-value")  
print(kable(dist.output))
District Varible Tubura Client Non-Tubura p-value
150 Karongi n_season_fert 3.785 0.447 < 0.001
222 Mugonero n_season_fert 4.313 0.597 < 0.001
402 Rutsiro n_season_fert 4.200 1.066 < 0.001
294 Nyamasheke n_season_fert 5.068 1.507 < 0.001
438 Rwamagana n_season_fert 2.645 1.054 < 0.001
114 Huye n_season_fert 2.246 0.322 < 0.001
42 Gatsibo_NLWH n_season_fert 1.517 0.296 < 0.001
428 Rutsiro own.cows 0.824 0.545 < 0.001
258 Nyamagabe n_season_fert 4.000 1.554 < 0.001
122 Huye valley 0.254 0.052 < 0.001
123 Huye hillside 0.702 0.922 < 0.001
296 Nyamasheke n_season_lime 0.327 0.041 < 0.001
186 Kayonza n_season_fert 2.073 0.776 0.001
434 Rwamagana age 44.245 52.072 0.002
291 Nyamasheke hhsize 5.701 4.685 0.005
145 Karongi female 0.466 0.667 0.007
363 Nyaruguru hhsize 6.217 4.717 0.007
104 Gisagara own.cows 0.696 0.326 0.007
106 Gisagara own.chickens 0.370 0.065 0.009
107 Gisagara own.pigs 0.565 0.217 0.011
110 Huye age 43.395 49.400 0.015
89 Gisagara cows 1.043 0.413 0.016
99 Gisagara m3.Mg 191.739 239.872 0.02
325 Nyanza female 0.217 0.543 0.021
98 Gisagara m3.Ca 903.287 1203.933 0.021
330 Nyanza n_season_fert 1.326 0.152 0.03
38 Gatsibo_NLWH age 39.897 46.210 0.032
37 Gatsibo_NLWH female 0.322 0.556 0.033
91 Gisagara chickens 1.196 0.174 0.033
466 Rwamagana own.chickens 0.418 0.225 0.033
2 Gatsibo_LWH age 41.931 49.949 0.042
253 Nyamagabe female 0.482 0.750 0.047
181 Kayonza female 0.400 0.672 0.048
140 Huye own.cows 0.675 0.487 0.05
40 Gatsibo_NLWH own 0.908 1.000 0.051
224 Mugonero n_season_lime 0.176 0.023 0.051
433 Rwamagana female 0.555 0.739 0.051
1 Gatsibo_LWH female 0.241 0.492 0.056
399 Rutsiro hhsize 5.624 4.844 0.056
310 Nyamasheke climbing_bean 0.422 0.267 0.06
284 Nyamagabe own.cows 0.732 0.482 0.071
313 Nyamasheke maize 0.082 0.014 0.071
169 Karongi maize 0.086 0.019 0.072
219 Mugonero hhsize 5.802 5.054 0.074
59 Gatsibo_NLWH sorghum 0.276 0.469 0.098
218 Mugonero age 44.290 49.140 0.1
101 Gisagara Total.N 0.142 0.155 0.116
125 Huye cows 1.167 0.774 0.116
361 Nyaruguru female 0.391 0.652 0.116
300 Nyamasheke slope 14.129 17.199 0.123
404 Rutsiro n_season_lime 0.327 0.096 0.123
102 Gisagara Total.C 1.815 2.005 0.124
295 Nyamasheke n_season_compost 6.544 5.521 0.131
338 Nyanza valley 0.283 0.087 0.131
339 Nyanza hillside 0.717 0.913 0.131
298 Nyamasheke n_seasons_leg_1 2.327 1.658 0.149
464 Rwamagana own.cows 0.555 0.396 0.149
238 Mugonero climbing_bean 0.321 0.194 0.152
366 Nyaruguru n_season_fert 2.130 1.065 0.152
92 Gisagara pigs 0.609 0.304 0.159
128 Huye pigs 0.965 0.748 0.159
75 Gisagara hhsize 5.348 4.326 0.162
6 Gatsibo_LWH n_season_fert 2.552 1.475 0.164
119 Huye n_seasons_leg_2 3.667 4.860 0.164
131 Huye sorghum 0.237 0.374 0.164
336 Nyanza slope 7.435 9.239 0.164
396 Nyaruguru own.sheep 0.109 0.000 0.164
455 Rwamagana sorghum 0.191 0.324 0.164
143 Huye own.pigs 0.754 0.617 0.17
417 Rutsiro sheep 0.467 0.246 0.177
111 Huye hhsize 5.132 4.574 0.187
255 Nyamagabe hhsize 5.482 4.696 0.187
309 Nyamasheke sheep 0.082 0.014 0.197
271 Nyamagabe chickens 0.857 0.304 0.202
430 Rutsiro own.chickens 0.291 0.192 0.212
286 Nyamagabe own.chickens 0.286 0.125 0.216
408 Rutsiro slope 15.521 13.958 0.22
179 Karongi own.pigs 0.399 0.289 0.225
192 Kayonza slope 3.327 1.690 0.225
81 Gisagara n_season_fallow 0.500 0.087 0.226
83 Gisagara n_seasons_leg_2 2.370 3.283 0.226
227 Mugonero n_seasons_leg_2 3.131 4.031 0.23
56 Gatsibo_NLWH pigs 0.241 0.062 0.236
100 Gisagara pH 5.759 5.950 0.236
328 Nyanza own 0.913 1.000 0.238
436 Rwamagana own 0.909 0.973 0.238
103 Gisagara ExAl 0.337 0.191 0.254
412 Rutsiro hilltop 0.097 0.042 0.254
449 Rwamagana cows 1.082 0.514 0.254
467 Rwamagana own.pigs 0.282 0.171 0.254
451 Rwamagana chickens 1.609 0.955 0.26
47 Gatsibo_NLWH n_seasons_leg_2 3.046 3.889 0.262
78 Gisagara n_season_fert 1.087 0.348 0.262
324 Nyamasheke own.sheep 0.054 0.014 0.269
335 Nyanza n_seasons_leg_2 2.130 3.478 0.269
152 Karongi n_season_lime 0.098 0.013 0.271
188 Kayonza n_season_lime 0.527 0.707 0.273
343 Nyanza chickens 2.348 1.174 0.273
441 Rwamagana n_season_fallow 1.218 0.712 0.273
381 Nyaruguru sheep 0.196 0.000 0.274
90 Gisagara goats 1.522 0.935 0.275
254 Nyamagabe age 43.768 48.875 0.275
160 Karongi hilltop 0.067 0.025 0.315
86 Gisagara valley 0.283 0.130 0.315
273 Nyamagabe sheep 0.286 0.089 0.315
377 Nyaruguru cows 1.196 0.891 0.315
389 Nyaruguru Total.N 0.161 0.170 0.315
323 Nyamasheke own.pigs 0.408 0.308 0.318
137 Huye Total.N 0.142 0.148 0.327
161 Karongi cows 1.387 1.119 0.348
355 Nyanza ExAl 0.117 0.175 0.358
52 Gatsibo_NLWH hilltop 0.011 0.062 0.358
133 Huye maize 0.061 0.017 0.358
201 Kayonza sheep 0.036 0.172 0.358
326 Nyanza age 44.717 50.152 0.372
392 Nyaruguru own.cows 0.826 0.674 0.372
73 Gisagara female 0.435 0.609 0.377
395 Nyaruguru own.pigs 0.609 0.435 0.377
146 Karongi age 44.681 47.384 0.379
210 Kayonza Total.C 2.395 2.552 0.382
380 Nyaruguru pigs 0.783 0.457 0.382
459 Rwamagana m3.Mg 270.948 289.488 0.384
248 Mugonero own.cows 0.786 0.698 0.387
182 Kayonza age 42.345 46.500 0.389
403 Rutsiro n_season_compost 8.176 7.593 0.417
43 Gatsibo_NLWH n_season_compost 2.632 3.420 0.417
33 Gatsibo_LWH own.goats 0.552 0.407 0.429
206 Kayonza m3.Ca 1677.807 1878.700 0.432
348 Nyanza bush_bean 0.261 0.413 0.446
168 Karongi bush_bean 0.080 0.132 0.454
413 Rutsiro cows 1.442 1.048 0.458
394 Nyaruguru own.chickens 0.283 0.152 0.46
242 Mugonero m3.Ca 551.499 603.770 0.462
405 Rutsiro n_season_fallow 0.473 0.287 0.485
299 Nyamasheke n_seasons_leg_2 2.137 2.681 0.488
71 Gatsibo_NLWH own.pigs 0.126 0.062 0.499
144 Huye own.sheep 0.053 0.017 0.499
272 Nyamagabe pigs 1.000 0.804 0.499
391 Nyaruguru ExAl 0.717 0.882 0.499
32 Gatsibo_LWH own.cows 0.500 0.373 0.5
96 Gisagara bush_bean 0.217 0.348 0.5
97 Gisagara maize 0.043 0.000 0.5
109 Huye female 0.500 0.591 0.5
184 Kayonza own 0.964 0.897 0.5
185 Kayonza field.size 664.236 423.776 0.5
216 Kayonza own.sheep 0.036 0.103 0.5
232 Mugonero hilltop 0.015 0.000 0.5
262 Nyamagabe n_seasons_leg_1 1.625 1.125 0.5
276 Nyamagabe bush_bean 0.000 0.036 0.5
288 Nyamagabe own.sheep 0.179 0.089 0.5
352 Nyanza pH 6.104 5.995 0.5
368 Nyaruguru n_season_lime 0.043 0.000 0.5
388 Nyaruguru pH 5.320 5.205 0.5
397 Rutsiro female 0.545 0.623 0.5
452 Rwamagana pigs 0.518 0.315 0.5
44 Gatsibo_NLWH n_season_lime 0.069 0.025 0.51
17 Gatsibo_LWH cows 0.759 0.542 0.514
49 Gatsibo_NLWH alt 1017.040 1129.172 0.514
55 Gatsibo_NLWH chickens 1.678 1.000 0.514
162 Karongi goats 0.890 1.069 0.514
209 Kayonza Total.N 0.180 0.187 0.514
344 Nyanza pigs 0.087 0.217 0.514
359 Nyanza own.pigs 0.065 0.152 0.514
364 Nyaruguru own 0.848 0.935 0.514
462 Rwamagana Total.C 2.174 2.236 0.514
205 Kayonza maize 0.018 0.069 0.514
432 Rutsiro own.sheep 0.212 0.156 0.514
424 Rutsiro pH 5.269 5.339 0.518
87 Gisagara hillside 0.587 0.717 0.524
151 Karongi n_season_compost 7.074 6.553 0.524
199 Kayonza chickens 0.982 0.466 0.544
422 Rutsiro m3.Ca 628.725 682.389 0.549
290 Nyamasheke age 45.898 48.123 0.551
356 Nyanza own.cows 0.652 0.522 0.551
105 Gisagara own.goats 0.630 0.500 0.552
148 Karongi own 0.957 0.981 0.552
207 Kayonza m3.Mg 331.923 356.976 0.552
244 Mugonero pH 5.180 5.243 0.552
247 Mugonero ExAl 0.925 0.823 0.552
320 Nyamasheke own.cows 0.639 0.568 0.552
350 Nyanza m3.Ca 1086.559 992.934 0.552
11 Gatsibo_LWH n_seasons_leg_2 1.983 2.690 0.556
18 Gatsibo_LWH goats 1.345 0.983 0.562
118 Huye n_seasons_leg_1 1.386 1.043 0.562
194 Kayonza valley 0.545 0.431 0.562
351 Nyanza m3.Mg 233.315 213.344 0.562
19 Gatsibo_LWH chickens 1.052 0.678 0.563
127 Huye chickens 1.772 1.348 0.563
409 Rutsiro alt 1945.122 1894.425 0.563
443 Rwamagana n_seasons_leg_2 1.349 1.685 0.563
147 Karongi hhsize 5.172 4.912 0.568
159 Karongi hillside 0.785 0.836 0.568
203 Kayonza sorghum 0.182 0.103 0.568
387 Nyaruguru m3.Mg 152.908 139.740 0.568
390 Nyaruguru Total.C 2.145 2.237 0.568
414 Rutsiro goats 0.515 0.659 0.568
277 Nyamagabe maize 0.089 0.036 0.569
426 Rutsiro Total.C 2.247 2.309 0.569
444 Rwamagana slope 3.682 4.234 0.569
429 Rutsiro own.goats 0.273 0.329 0.602
316 Nyamasheke pH 5.176 5.120 0.605
204 Kayonza bush_bean 0.218 0.138 0.614
61 Gatsibo_NLWH maize 0.092 0.049 0.632
250 Mugonero own.chickens 0.382 0.318 0.632
427 Rutsiro ExAl 0.814 0.736 0.632
453 Rwamagana sheep 0.509 0.360 0.636
3 Gatsibo_LWH hhsize 5.362 5.763 0.641
4 Gatsibo_LWH own 0.983 1.000 0.641
54 Gatsibo_NLWH goats 1.218 1.654 0.641
68 Gatsibo_NLWH own.cows 0.563 0.481 0.641
82 Gisagara n_seasons_leg_1 2.304 2.870 0.641
112 Huye own 0.974 0.991 0.641
116 Huye n_season_lime 0.035 0.113 0.641
126 Huye goats 0.965 0.809 0.641
174 Karongi Total.C 1.887 1.839 0.641
198 Kayonza goats 1.727 1.397 0.641
202 Kayonza climbing_bean 0.000 0.017 0.641
220 Mugonero own 0.947 0.915 0.641
241 Mugonero maize 0.000 0.008 0.641
269 Nyamagabe cows 0.911 0.714 0.641
319 Nyamasheke ExAl 0.983 1.062 0.641
349 Nyanza maize 0.130 0.065 0.641
358 Nyanza own.chickens 0.543 0.435 0.641
365 Nyaruguru field.size 570.022 495.239 0.641
384 Nyaruguru bush_bean 0.065 0.022 0.641
385 Nyaruguru maize 0.022 0.000 0.641
401 Rutsiro field.size 584.327 430.156 0.641
416 Rutsiro pigs 0.315 0.240 0.641
457 Rwamagana maize 0.027 0.009 0.641
458 Rwamagana m3.Ca 1303.126 1380.293 0.641
461 Rwamagana Total.N 0.175 0.179 0.641
217 Mugonero female 0.656 0.713 0.644
341 Nyanza cows 1.174 0.891 0.644
50 Gatsibo_NLWH valley 0.759 0.691 0.646
166 Karongi climbing_bean 0.405 0.352 0.646
233 Mugonero cows 1.328 1.171 0.646
347 Nyanza sorghum 0.283 0.196 0.646
410 Rutsiro valley 0.152 0.192 0.646
196 Kayonza hilltop 0.109 0.172 0.646
53 Gatsibo_NLWH cows 0.885 0.741 0.651
301 Nyamasheke alt 1678.150 1704.388 0.651
423 Rutsiro m3.Mg 165.214 175.386 0.651
213 Kayonza own.goats 0.673 0.586 0.652
8 Gatsibo_LWH n_season_lime 0.500 0.322 0.653
77 Gisagara field.size 493.957 559.261 0.661
132 Huye bush_bean 0.193 0.243 0.661
281 Nyamagabe Total.N 0.169 0.165 0.661
379 Nyaruguru chickens 0.652 0.391 0.661
70 Gatsibo_NLWH own.chickens 0.310 0.247 0.665
421 Rutsiro maize 0.176 0.216 0.665
5 Gatsibo_LWH field.size 776.345 916.441 0.688
85 Gisagara alt 1036.055 923.525 0.688
282 Nyamagabe Total.C 2.345 2.274 0.688
337 Nyanza alt 1505.653 1529.554 0.688
208 Kayonza pH 6.132 6.204 0.689
221 Mugonero field.size 441.359 393.124 0.689
425 Rutsiro Total.N 0.157 0.159 0.689
468 Rwamagana own.sheep 0.227 0.180 0.689
34 Gatsibo_LWH own.chickens 0.345 0.271 0.694
141 Huye own.goats 0.518 0.461 0.694
45 Gatsibo_NLWH n_season_fallow 0.736 0.543 0.695
308 Nyamasheke pigs 0.776 0.582 0.695
393 Nyaruguru own.goats 0.348 0.435 0.695
357 Nyanza own.goats 0.630 0.543 0.697
375 Nyaruguru hillside 0.957 0.913 0.697
376 Nyaruguru hilltop 0.043 0.087 0.697
95 Gisagara sorghum 0.435 0.522 0.699
270 Nyamagabe goats 0.750 0.607 0.699
289 Nyamasheke female 0.646 0.692 0.699
155 Karongi n_seasons_leg_2 3.216 3.548 0.701
287 Nyamagabe own.pigs 0.732 0.661 0.704
60 Gatsibo_NLWH bush_bean 0.172 0.222 0.704
117 Huye n_season_fallow 0.675 0.487 0.704
293 Nyamasheke field.size 787.527 580.130 0.704
327 Nyanza hhsize 5.261 4.891 0.704
386 Nyaruguru m3.Ca 660.083 610.274 0.704
236 Mugonero pigs 0.412 0.333 0.709
212 Kayonza own.cows 0.382 0.310 0.709
177 Karongi own.goats 0.497 0.541 0.71
251 Mugonero own.pigs 0.244 0.287 0.718
373 Nyaruguru alt 1791.131 1814.738 0.718
257 Nyamagabe field.size 310.929 282.643 0.721
74 Gisagara age 44.761 46.848 0.726
252 Mugonero own.sheep 0.122 0.093 0.726
456 Rwamagana bush_bean 0.364 0.315 0.726
243 Mugonero m3.Mg 160.377 170.165 0.732
10 Gatsibo_LWH n_seasons_leg_1 4.345 4.678 0.741
23 Gatsibo_LWH sorghum 0.276 0.339 0.741
26 Gatsibo_LWH m3.Ca 1226.445 1149.970 0.741
124 Huye hilltop 0.044 0.026 0.741
149 Karongi field.size 490.788 446.088 0.741
164 Karongi pigs 0.528 0.447 0.741
175 Karongi ExAl 0.647 0.597 0.745
200 Kayonza pigs 0.309 0.224 0.746
129 Huye sheep 0.061 0.035 0.747
460 Rwamagana pH 5.817 5.862 0.756
41 Gatsibo_NLWH field.size 1306.069 1459.901 0.764
115 Huye n_season_compost 5.211 4.878 0.764
24 Gatsibo_LWH bush_bean 0.397 0.339 0.779
57 Gatsibo_NLWH sheep 0.046 0.025 0.779
69 Gatsibo_NLWH own.goats 0.517 0.568 0.779
171 Karongi m3.Mg 254.160 267.784 0.779
173 Karongi Total.N 0.144 0.142 0.779
176 Karongi own.cows 0.767 0.736 0.779
190 Kayonza n_seasons_leg_1 1.982 2.155 0.779
317 Nyamasheke Total.N 0.165 0.167 0.779
333 Nyanza n_season_fallow 1.130 0.870 0.779
400 Rutsiro own 0.915 0.934 0.779
20 Gatsibo_LWH pigs 0.138 0.203 0.782
79 Gisagara n_season_compost 4.326 3.957 0.782
214 Kayonza own.chickens 0.182 0.138 0.782
172 Karongi pH 5.442 5.477 0.785
439 Rwamagana n_season_compost 4.018 3.748 0.785
36 Gatsibo_LWH own.sheep 0.034 0.017 0.792
39 Gatsibo_NLWH hhsize 5.333 5.160 0.792
191 Kayonza n_seasons_leg_2 1.091 1.259 0.792
215 Kayonza own.pigs 0.218 0.172 0.792
226 Mugonero n_seasons_leg_1 1.336 1.186 0.792
256 Nyamagabe own 0.964 0.982 0.792
345 Nyanza sheep 0.022 0.043 0.792
346 Nyanza climbing_bean 0.043 0.022 0.792
360 Nyanza own.sheep 0.022 0.043 0.792
369 Nyaruguru n_season_fallow 0.370 0.261 0.792
378 Nyaruguru goats 0.609 0.739 0.792
446 Rwamagana valley 0.409 0.369 0.792
447 Rwamagana hillside 0.527 0.568 0.792
450 Rwamagana goats 1.745 1.568 0.792
76 Gisagara own 0.826 0.870 0.793
9 Gatsibo_LWH n_season_fallow 0.397 0.288 0.796
259 Nyamagabe n_season_compost 7.857 7.518 0.796
285 Nyamagabe own.goats 0.464 0.411 0.796
398 Rutsiro age 43.685 44.617 0.796
195 Kayonza hillside 0.345 0.397 0.797
84 Gisagara slope 8.391 7.739 0.802
315 Nyamasheke m3.Mg 156.068 149.714 0.802
260 Nyamagabe n_season_lime 0.500 0.393 0.803
283 Nyamagabe ExAl 1.111 1.170 0.808
134 Huye m3.Ca 850.319 872.728 0.815
135 Huye m3.Mg 187.671 191.878 0.815
63 Gatsibo_NLWH m3.Mg 246.996 255.132 0.818
249 Mugonero own.goats 0.542 0.574 0.818
265 Nyamagabe alt 2124.389 2109.814 0.818
113 Huye field.size 599.096 567.922 0.821
263 Nyamagabe n_seasons_leg_2 3.161 2.929 0.821
278 Nyamagabe m3.Ca 458.692 440.797 0.821
382 Nyaruguru climbing_bean 0.196 0.239 0.821
314 Nyamasheke m3.Ca 614.196 590.190 0.84
180 Karongi own.sheep 0.080 0.094 0.84
228 Mugonero slope 15.939 16.473 0.84
292 Nyamasheke own 0.932 0.945 0.84
367 Nyaruguru n_season_compost 5.696 6.043 0.84
465 Rwamagana own.goats 0.536 0.568 0.84
142 Huye own.chickens 0.421 0.391 0.843
275 Nyamagabe sorghum 0.036 0.054 0.843
329 Nyanza field.size 853.359 766.217 0.843
235 Mugonero chickens 1.076 0.961 0.844
370 Nyaruguru n_seasons_leg_1 2.043 2.348 0.844
334 Nyanza n_seasons_leg_1 2.543 2.283 0.856
178 Karongi own.chickens 0.411 0.434 0.863
437 Rwamagana field.size 892.855 842.054 0.863
442 Rwamagana n_seasons_leg_1 1.682 1.568 0.863
312 Nyamasheke bush_bean 0.088 0.075 0.867
163 Karongi chickens 1.503 1.377 0.87
440 Rwamagana n_season_lime 0.355 0.324 0.87
454 Rwamagana climbing_bean 0.036 0.027 0.872
72 Gatsibo_NLWH own.sheep 0.034 0.025 0.89
306 Nyamasheke goats 0.918 0.863 0.89
165 Karongi sheep 0.166 0.195 0.902
13 Gatsibo_LWH alt 1094.532 1126.650 0.912
136 Huye pH 5.681 5.664 0.912
223 Mugonero n_season_compost 6.229 6.070 0.912
297 Nyamasheke n_season_fallow 0.599 0.664 0.912
431 Rutsiro own.pigs 0.194 0.180 0.912
35 Gatsibo_LWH own.pigs 0.121 0.102 0.916
411 Rutsiro hillside 0.752 0.766 0.919
231 Mugonero hillside 0.855 0.868 0.925
31 Gatsibo_LWH ExAl 0.151 0.164 0.927
28 Gatsibo_LWH pH 6.139 6.115 0.928
29 Gatsibo_LWH Total.N 0.148 0.147 0.928
30 Gatsibo_LWH Total.C 1.882 1.856 0.928
88 Gisagara hilltop 0.130 0.152 0.928
120 Huye slope 6.658 6.817 0.928
266 Nyamagabe valley 0.125 0.107 0.928
121 Huye alt 1647.892 1655.403 0.93
318 Nyamasheke Total.C 2.307 2.289 0.93
371 Nyaruguru n_seasons_leg_2 3.178 3.370 0.93
445 Rwamagana alt 982.728 961.878 0.93
51 Gatsibo_NLWH hillside 0.230 0.247 0.934
64 Gatsibo_NLWH pH 6.037 6.053 0.934
170 Karongi m3.Ca 781.863 796.295 0.934
274 Nyamagabe climbing_bean 0.161 0.143 0.934
280 Nyamagabe pH 4.999 5.013 0.934
267 Nyamagabe hillside 0.821 0.839 0.938
406 Rutsiro n_seasons_leg_1 3.388 3.479 0.938
158 Karongi valley 0.147 0.138 0.944
197 Kayonza cows 0.727 0.828 0.944
225 Mugonero n_season_fallow 0.863 0.915 0.944
239 Mugonero sorghum 0.061 0.054 0.944
353 Nyanza Total.N 0.134 0.135 0.944
27 Gatsibo_LWH m3.Mg 238.242 234.655 0.946
130 Huye climbing_bean 0.088 0.096 0.946
211 Kayonza ExAl 0.111 0.117 0.946
264 Nyamagabe slope 12.571 12.839 0.946
279 Nyamagabe m3.Mg 109.343 111.050 0.946
307 Nyamasheke chickens 0.599 0.562 0.946
463 Rwamagana ExAl 0.257 0.249 0.946
67 Gatsibo_NLWH ExAl 0.182 0.175 0.947
261 Nyamagabe n_season_fallow 0.268 0.304 0.947
354 Nyanza Total.C 1.766 1.782 0.947
139 Huye ExAl 0.307 0.315 0.954
435 Rwamagana hhsize 4.927 4.982 0.954
415 Rutsiro chickens 0.879 0.958 0.955
138 Huye Total.C 1.917 1.925 0.956
62 Gatsibo_NLWH m3.Ca 1236.151 1251.746 0.956
183 Kayonza hhsize 5.091 5.155 0.956
21 Gatsibo_LWH sheep 0.103 0.085 0.96
418 Rutsiro climbing_bean 0.327 0.335 0.965
7 Gatsibo_LWH n_season_compost 5.672 5.780 0.97
229 Mugonero alt 1796.175 1788.315 0.97
48 Gatsibo_NLWH slope 4.011 4.099 0.971
66 Gatsibo_NLWH Total.C 2.002 1.990 0.971
189 Kayonza n_season_fallow 0.455 0.483 0.971
302 Nyamasheke valley 0.150 0.144 0.971
407 Rutsiro n_seasons_leg_2 2.036 2.078 0.971
303 Nyamasheke hillside 0.823 0.829 0.971
14 Gatsibo_LWH valley 0.603 0.593 0.979
15 Gatsibo_LWH hillside 0.397 0.407 0.979
237 Mugonero sheep 0.221 0.233 0.984
65 Gatsibo_NLWH Total.N 0.157 0.158 0.985
167 Karongi sorghum 0.147 0.151 0.986
234 Mugonero goats 1.321 1.302 0.986
331 Nyanza n_season_compost 3.261 3.326 0.986
342 Nyanza goats 1.261 1.283 0.997
245 Mugonero Total.N 0.153 0.153 0.999
12 Gatsibo_LWH slope 4.259 4.220 1
46 Gatsibo_NLWH n_seasons_leg_1 1.540 1.556 1
80 Gisagara n_season_lime 0.022 0.022 1
93 Gisagara sheep 0.022 0.022 1
94 Gisagara climbing_bean 0.022 0.022 1
108 Gisagara own.sheep 0.022 0.022 1
153 Karongi n_season_fallow 0.834 0.843 1
154 Karongi n_seasons_leg_1 2.485 2.497 1
156 Karongi slope 11.975 11.937 1
157 Karongi alt 1806.841 1806.879 1
187 Kayonza n_season_compost 3.564 3.534 1
193 Kayonza alt 1196.807 1202.646 1
230 Mugonero valley 0.130 0.132 1
240 Mugonero bush_bean 0.176 0.178 1
246 Mugonero Total.C 2.217 2.218 1
268 Nyamagabe hilltop 0.054 0.054 1
304 Nyamasheke hilltop 0.027 0.027 1
305 Nyamasheke cows 1.245 1.240 1
321 Nyamasheke own.goats 0.442 0.445 1
322 Nyamasheke own.chickens 0.218 0.219 1
362 Nyaruguru age 48.457 48.304 1
372 Nyaruguru slope 9.478 9.457 1
383 Nyaruguru sorghum 0.304 0.304 1
420 Rutsiro bush_bean 0.012 0.012 1
448 Rwamagana hilltop 0.064 0.063 1
16 Gatsibo_LWH hilltop 0.000 0.000 NA
22 Gatsibo_LWH climbing_bean 0.000 0.000 NA
25 Gatsibo_LWH maize 0.000 0.000 NA
58 Gatsibo_NLWH climbing_bean 0.000 0.000 NA
311 Nyamasheke sorghum 0.000 0.000 NA
332 Nyanza n_season_lime 0.000 0.000 NA
340 Nyanza hilltop 0.000 0.000 NA
374 Nyaruguru valley 0.000 0.000 NA
419 Rutsiro sorghum 0.000 0.000 NA

District balance interpretation

Demographic variables interpretation here.

Agricultural practice variables interpretation here

Soil Variables interpretation here

write.csv(dist.output, file=paste(od, "district balance.csv", sep="/"), row.names=T)

Baseline balance by Tubura tenure

Look at farmers by duration of tenure farming with Tubura. We want to understand, at least with an initial naive baseline sense, what is the cumulative effect of Tubura practices on soil health outcomes?

We will look only at current Tubura farmers and compare first year farmers to farmers with more experience with Tubura.

oafOnly <- d[which(d$client==1 & d$total.seasons>=1),]

nTenure <- oafOnly %>% group_by(total.seasons) %>% 
  dplyr::summarize(
    n = n()
  ) %>% ungroup() %>% as.data.frame()
nTenure$val <- paste(nTenure$total.seasons, " (", "n = ", nTenure$n, ")", sep = "")

for(i in 1:length(soilVars)){
print(
  ggplot(oafOnly, aes(x=as.factor(total.seasons), y=oafOnly[,soilVars[i]])) + 
    geom_boxplot() +
    scale_x_discrete(labels=nTenure$val) + 
    theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(x="Tubura Tenure", y=soilVars[i], title = paste("RW baseline soil by tenure - ", soilVars[i], sep = ""))
  )  
}

Tenure summaries

tenureSum <- aggregate(oafOnly[,out.list], by=list(oafOnly$total.seasons), function(x){
  round(mean(x, na.rm=T),2)
})
tenureSum <- as.data.frame(t(tenureSum))
colnames(tenureSum) <- c(paste(seq(1,14,1), " seas.", sep = ""))
print(kable(tenureSum))
1 seas. 2 seas. 3 seas. 4 seas. 5 seas. 6 seas. 7 seas. 8 seas. 9 seas. 10 seas. 11 seas. 12 seas. 13 seas. 14 seas.
Group.1 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00 12.00 13.00 14.00
female 0.42 0.48 0.60 0.49 0.59 0.59 0.52 0.50 0.65 0.62 0.75 0.53 0.55 0.48
age 42.63 42.68 44.33 45.58 43.28 43.96 44.19 50.91 51.85 42.76 48.25 48.97 52.73 49.52
hhsize 5.24 5.17 5.08 5.21 6.07 5.58 6.19 5.66 6.65 6.07 6.58 5.53 4.55 6.12
own 0.93 0.95 0.92 0.95 0.98 0.97 1.00 0.93 0.85 1.00 0.92 0.89 0.91 0.95
field.size 606.74 630.98 630.65 551.74 482.02 484.79 412.52 592.95 330.00 614.79 617.83 440.76 280.82 679.71
n_season_fert 1.76 2.32 2.63 3.28 4.67 4.90 6.07 5.59 6.00 5.83 7.25 7.18 7.18 6.79
n_season_compost 5.35 5.37 4.81 6.37 6.87 7.13 8.30 7.43 7.30 7.52 8.67 8.24 8.27 7.81
n_season_lime 0.16 0.27 0.36 0.31 0.22 0.26 0.07 0.07 0.20 0.28 0.08 0.16 0.09 0.48
n_season_fallow 0.65 0.50 0.77 0.90 0.65 0.66 0.67 0.77 1.10 0.21 0.42 0.21 1.82 0.67
n_seasons_leg_1 1.97 2.07 2.10 2.44 2.96 2.89 1.96 2.91 1.60 1.79 2.92 3.53 2.09 3.02
n_seasons_leg_2 2.44 2.54 2.03 2.32 2.67 2.62 4.93 3.50 2.75 2.59 3.92 1.49 2.82 2.12
slope 9.05 8.66 9.65 8.80 9.57 11.84 14.07 11.77 13.60 12.31 14.17 11.26 14.91 14.38
alt 1531.01 1491.56 1550.41 1518.25 1654.93 1582.27 1758.60 1639.74 1812.98 1764.71 1788.95 1640.20 1702.35 1633.76
valley 0.31 0.32 0.27 0.25 0.30 0.22 0.26 0.23 0.00 0.07 0.33 0.18 0.09 0.21
hillside 0.62 0.61 0.71 0.72 0.67 0.74 0.70 0.66 0.95 0.90 0.67 0.79 0.82 0.76
hilltop 0.07 0.06 0.03 0.03 0.02 0.05 0.04 0.11 0.05 0.03 0.00 0.03 0.09 0.02
cows 0.99 1.00 0.94 1.49 1.91 1.28 1.30 1.09 3.25 1.31 1.75 0.97 1.00 1.07
goats 0.97 1.12 1.00 1.28 1.39 1.00 0.78 1.16 1.20 0.90 0.83 1.11 0.45 1.26
chickens 1.07 1.22 1.08 1.20 1.28 0.87 1.78 1.61 1.20 1.45 3.58 1.76 0.45 1.00
pigs 0.55 0.52 0.58 0.51 0.37 0.47 0.48 0.55 0.45 0.55 0.25 0.84 0.64 0.83
sheep 0.20 0.21 0.29 0.34 0.17 0.16 0.44 0.23 0.15 0.41 0.00 0.05 0.18 0.17
climbing_bean 0.14 0.17 0.21 0.17 0.20 0.32 0.52 0.36 0.35 0.45 0.33 0.39 0.55 0.38
sorghum 0.18 0.17 0.12 0.14 0.11 0.06 0.04 0.02 0.05 0.10 0.17 0.05 0.09 0.00
bush_bean 0.14 0.17 0.17 0.17 0.15 0.19 0.04 0.18 0.15 0.03 0.08 0.11 0.18 0.14
maize 0.06 0.07 0.06 0.08 0.09 0.08 0.11 0.11 0.10 0.10 0.08 0.03 0.00 0.05
m3.Ca 890.32 911.95 871.92 918.19 913.35 840.76 771.94 755.95 592.43 582.74 663.89 848.28 592.16 618.11
m3.Mg 195.33 201.53 213.24 210.61 218.70 214.24 210.69 222.18 156.45 172.28 198.05 275.01 151.55 166.45
pH 5.57 5.57 5.57 5.53 5.64 5.52 5.43 5.46 5.19 5.22 5.37 5.57 5.29 5.25
Total.N 0.16 0.16 0.16 0.16 0.15 0.15 0.15 0.14 0.16 0.16 0.14 0.15 0.15 0.16
Total.C 2.11 2.15 2.11 2.15 2.13 2.07 2.05 1.93 2.38 2.16 1.88 1.97 2.00 2.22
ExAl 0.59 0.57 0.62 0.54 0.46 0.61 0.60 0.56 0.91 0.75 0.70 0.48 0.64 0.90
own.cows 0.69 0.63 0.58 0.75 0.72 0.69 0.81 0.70 0.90 0.90 0.75 0.63 0.64 0.81
own.goats 0.45 0.52 0.47 0.46 0.54 0.50 0.48 0.55 0.50 0.48 0.50 0.53 0.27 0.52
own.chickens 0.30 0.35 0.33 0.30 0.33 0.30 0.56 0.43 0.45 0.45 0.58 0.42 0.45 0.33
own.pigs 0.42 0.36 0.32 0.31 0.26 0.33 0.41 0.36 0.35 0.34 0.17 0.39 0.36 0.40
own.sheep 0.13 0.09 0.14 0.17 0.11 0.08 0.26 0.14 0.10 0.14 0.00 0.03 0.09 0.07

Tenure balance

We’re defining Tubura tenure as having 3 or more seasons of experience farming with Tubura. We draw the line at 3 seasons as three seasons of fertilizer use is approximately when we’d expect fertilizer to start to have a detrimental effect on soil health.

oafOnly$tenured <- ifelse(oafOnly$total.seasons>=3,1,0)

tenure <- do.call(rbind, lapply(out.list, function(x) {
  
  out <- t.test(oafOnly[,x] ~ oafOnly[,"tenured"], data=oafOnly)
  tab <- data.frame(out[[5]][[1]], out[[5]][[2]], out[3])
  tab[,1:2] <- round(tab[,1:2],3)
  names(tab) <- c(names(out[[5]]), "pvalue")
  return(tab)
}))

# use p.adjust with bonferroni correction
tenure$pvalue <- p.adjust(tenure$pvalue, method="fdr")
rownames(tenure) <- out.list
tenure <- tenure[order(tenure$pvalue),]
tenure$pvalue <- ifelse(tenure[, 3] < 0.001, "< 0.001", round(tenure[, 3], 3)) 


colnames(tenure) <- c("Non-Tubura", "Tubura Client", "p-value") 
print(kable(tenure))
Non-Tubura Tubura Client p-value
n_season_fert 2.097 4.774 < 0.001
n_season_compost 5.359 6.899 < 0.001
climbing_bean 0.159 0.295 < 0.001
age 42.657 46.057 < 0.001
slope 8.814 11.103 < 0.001
hillside 0.616 0.736 < 0.001
sorghum 0.173 0.086 < 0.001
n_seasons_leg_1 2.030 2.583 0.006
valley 0.319 0.225 0.007
hhsize 5.200 5.597 0.01
cows 0.992 1.358 0.01
female 0.457 0.552 0.012
alt 1507.236 1611.057 0.015
m3.Ca 903.359 813.631 0.034
pH 5.570 5.478 0.036
Total.N 0.159 0.154 0.043
own.cows 0.657 0.719 0.097
field.size 621.349 535.512 0.117
hilltop 0.065 0.039 0.178
own.pigs 0.386 0.334 0.188
n_season_fallow 0.559 0.719 0.209
m3.Mg 199.065 208.991 0.334
Total.C 2.136 2.102 0.486
ExAl 0.574 0.613 0.518
own.sheep 0.103 0.122 0.518
own.chickens 0.332 0.360 0.525
sheep 0.203 0.242 0.598
maize 0.065 0.077 0.6
n_season_lime 0.227 0.259 0.621
chickens 1.162 1.261 0.633
goats 1.062 1.111 0.726
own 0.941 0.947 0.767
n_seasons_leg_2 2.500 2.575 0.767
bush_bean 0.159 0.151 0.767
pigs 0.532 0.545 0.871
own.goats 0.492 0.492 0.991

Tenured v. new balance interpretation

Demographic variables We are well balanced along demographic variables.

Agricultural practice variables Not surprisingly, Tubura farmers have more cumulative years of fertilizer use than current non-Tubura farmers. While that difference is signficant, it is realistically only a single season of fertilizer use different.

Interestingly, non-Tubura farmers reported using more lime than current Tubura farmers. This

Soil Variables Soil pH, calcium and magnesium levels are lower for tenured Tubura farmers. This is consistent with the hypothesis that increaesd fertilizer use leads to an increaese in soil acidity.

Analysis of agronomic practices contributing to soil health outcomes

Here’s where we’ll look at the contribution of fertilizer, lime and cultivation practices on soil health outcomes. This analysis will be come richer as we gain longitudinal measures. I caution that we cannot treat these relationships as causal. The direction of causality is not clearly delineated in the data or the study design. However, we can identify meaningful connections between practices and outcomes through this analysis to generate new hypotheses for field testing.

I’m going to start with behaviors by sections and then move to a more comprehensive model including multiple practices. All models will include controls for site to account for local variation and field officer behavior.

Agronomic Behaviors

Check for multicollinearity before adding number of seasons of agronomic inputs on the same side of the regression.

suppressMessages(library(stargazer))

inputUse <- c("n_season_fert","n_season_compost", "n_season_lime", "n_season_fallow")

cor(d[,inputUse], use="complete.obs")
##                  n_season_fert n_season_compost n_season_lime
## n_season_fert       1.00000000       0.35646933    0.17075604
## n_season_compost    0.35646933       1.00000000    0.03828723
## n_season_lime       0.17075604       0.03828723    1.00000000
## n_season_fallow    -0.06439632      -0.16923273   -0.01116383
##                  n_season_fallow
## n_season_fert        -0.06439632
## n_season_compost     -0.16923273
## n_season_lime        -0.01116383
## n_season_fallow       1.00000000

Interpretation: The strongest correlation between the input use intensity variables is between seasons of fertilizer and compost use, ~0.35. While this is on the higher end it’s not necessarily cause for alarm.

inputUse <- paste(c("n_season_fert","n_season_compost", "n_season_lime", "n_season_fallow"), collapse= " + ")

list1 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", inputUse, "+ as.factor(cell)", sep="")), data=d)
  return(mod)
})

stargazer(list1, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Naive Agronomic Practice Models",
          covariate.labels = c("Seasons of Fertilizer", "Seasons of Compost", 
                               "Seasons of Lime", "Seasons of Fallow"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_agprac.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Naive Agronomic Practice Models
Ca Mg pH Total.N Total.C ExAl
(1) (2) (3) (4) (5) (6)
Seasons of Fertilizer -5.326 -1.128 -0.001 -0.0004** -0.003 -0.001
(3.321) (0.702) (0.003) (0.0002) (0.003) (0.004)
Seasons of Compost 1.727 0.363 0.004 0.0001 0.002 -0.006**
(2.680) (0.567) (0.003) (0.0001) (0.003) (0.003)
Seasons of Lime 41.659*** 7.998*** 0.024* 0.002** 0.022 0.009
(14.473) (3.061) (0.014) (0.001) (0.015) (0.016)
Seasons of Fallow -19.069*** -3.517*** -0.028*** 0.0005 0.009* 0.025***
(5.283) (1.117) (0.005) (0.0003) (0.005) (0.006)
Constant 548.378 68.041 4.684*** 0.199*** 3.360*** 1.904***
(387.623) (81.977) (0.370) (0.021) (0.392) (0.420)
Observations 2,443 2,443 2,443 2,443 2,443 2,443
R2 0.555 0.585 0.612 0.502 0.456 0.570
Adjusted R2 0.514 0.547 0.577 0.456 0.407 0.531
Residual Std. Error (df = 2238) 387.475 81.946 0.370 0.021 0.392 0.420
F Statistic (df = 204; 2238) 13.684*** 15.452*** 17.322*** 11.044*** 9.202*** 14.545***
Note: p<0.1; p<0.05; p<0.01
Includes FE for cell

Interpretation The naive model suggests that when we include site level fixed effects, duration of agronomic practices don’t have a big effect on soil health outcomes. However, some of the practice intensity variables are not well distributed. Let’s take a look at a log transformation. I’m adding one to the variables as to not end up with lots of Inf values.

Log transformations in theory are appropriate for variables that are right skewed (vavlues clustered to the left of the distribution) and see diminishing returns to increasing values. The shape of the data suggests a log transformation but it’s debateable whether the relationship is diminishing.

agPrac <- c(names(d[grep('n_season_', names(d))]))

for(i in 1:length(agPrac)){
  print(
    ggplot(d, aes(x=d[,agPrac[i]])) + geom_density() +
      labs(x = paste(agPrac[i], " No transform", sep = ""))
        )
}

# since these are all skewed, consider a log transform 
for(i in 1:length(agPrac)){
  print(
    ggplot(d, aes(x=log10(d[,agPrac[i]]+1))) + geom_density() + 
      labs(x = paste(agPrac[i], " Log transform", sep = ""))
        )
}

# look at other transfomations
for(i in 1:length(agPrac)){
  print(
    ggplot(d, aes(x=d[,agPrac[i]]^(1/3))) + geom_density() +
      labs(x = paste(agPrac[i], " cubic root transform", sep = ""))
        )
}

# visualize the outcomes as well to see if a transformation is warranted
for(i in 1:length(soilVars)){
  print(
    ggplot(d, aes(x=d[,soilVars[i]])) + geom_density() +
      labs(x = soilVars[i], title = soilVars[i])
        )
}

d$logFert <- log(d$n_season_fert+1)
d$logCompost <- log(d$n_season_compost+1)
d$logLime <- log(d$n_season_lime+1)
d$logFallow <- log(d$n_season_fallow+1)

Or look at BoxCox graph to empirically determine the right transformation. Log is assuming a diminishing return to an increasing X. That’s probably not the case with fertilizer. We’d actually expect an increasing return as values get larger. We use boxcox to see what the data suggest. We interpret it as follows:

  • lambda = 2 -> square
  • lambda = 1 -> no transformation
  • lambda = 0.5 -> square root
  • lambda = 0 -> log
  • lamdba = -1 -> inverse
library(MASS)
for(i in 1:length(agPrac)){
boxcox(lm(pH ~ d[,agPrac[i]], data=d))
}

For pH at least it seems like a log transform is appropriate. We can run this for all other variables as well to see what we get as well.

Let’s look at the log results: See here and here for guidance on intepreting log transformed right hand side variables. See here for additional guidance on choosing a transformation.

How to interpret RHS log transform: For a linear multivariate OLS regression, we say “a one unit increase in X causes a (coefficient) change in Y.” For a linear-log regression where the X variable is log transformed, we say a L percent change in X leads to a (coefficient*L) change in Y.

Question: Do we want to run this analysis for only OAF farmers? If so, adjust the sample accordingly.

logVars <- paste(names(d[grep("log", names(d))]), collapse=" + ")

list2 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  logVars, "+ as.factor(cell)", sep="")), data=d)
  return(mod)
})

list2b <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  logVars, sep="")), data=d)
  return(mod)
})
suppressWarnings(
  stargazer(list2, list2b, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Log Agronomic Practice Models",
          covariate.labels = c("Seasons of Fertilizer (log)", "Seasons of Compost (log)", "Seasons of Lime (log)", "Seasons of Fallow (log)"),
          column.labels = c(rep(gsub("m3.","", soilVars),2)),
          dep.var.caption = "",
          dep.var.labels = c("",""),
          add.lines = list(c("Cell FE?", rep("Yes", 5), rep("No", 5))),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_agprac_log.htm", sep="/"))
)
plm.log <- function(x, range){
  
  beta = round(summary(x)$coefficients[range,1],3)
  beta.pval = summary(x)$coefficients[range,4]
  beta.conv = ifelse(beta.pval < 0.01, "***", ifelse(
    beta.pval < 0.05, "**", ifelse(
      beta.pval < 0.1, "*", "")))
  #beta.pval = round(beta.pval, 3)
  outcome = paste(beta, beta.conv, sep = "")
  outcome = c(outcome, unique(round(summary(x)$coefficients[1,1],3)))
  res = data.frame(outcome, stringsAsFactors = F)
  
  return(res)  
}

rw.table15 <- do.call(cbind, lapply(list2, function(x){
  plm.log(x, 2:5)
}))

colnames(rw.table15) <- soilVars
rownames(rw.table15) <- c(names(d)[grep("log", names(d))], "constant")

Table 15 outcomes

rw.table15 <- rw.table15[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]

write.csv(rw.table15, file=paste(od, "rwtable15_reg.csv", sep = "/"),
          row.names = T)
# a 10 percent increase in x leads to B1*.1 change in Y
logTrans <- do.call(cbind, lapply(list2, function(x){
  coeff = x$coefficients[2:5]
  tenPercent = round(coeff*.1, 5)
  names(tenPercent) <- paste("10% increase in ", gsub("log","", names(tenPercent)), " leads to:", sep="")
  return(tenPercent)
}))
colnames(logTrans) <- soilVars
print(kable(logTrans))
m3.Ca m3.Mg pH Total.N Total.C ExAl
10% increase in Fert leads to: -2.31951 -0.52268 -0.00091 -0.00016 -0.00132 -0.00013
10% increase in Compost leads to: 1.00464 0.17075 0.00180 0.00007 0.00074 -0.00217
10% increase in Lime leads to: 12.28263 2.25366 0.00658 0.00053 0.00676 0.00040
10% increase in Fallow leads to: -7.04768 -1.33558 -0.01025 0.00011 0.00222 0.00895

Interpretation: When we transform the variables to log, the data starts to tell a more coherent story, at least directionally. If we remove the district FE, the coefficients gain significance.

  • Additional seasons of fertilizer use relates to a decrease in soil N and C.
  • Seasons of compost and lime have the opposite, positive effect on soil N.
  • Unsurprisingly, soil health metrics are poorer for soils on which more fertilizer has been used and lime has not been used.

Tenure with One Acre Fund

Let’s look first at a naive model of One Acre Fund tenure on soil health. Remember: these data are not longitudinal! These data are not longitudinal and reflect farmer selection into One Acre Fund. While these models will try to be both robust and parsimonious, we will inevitabily suffer omitted variable bias due to a lack of an instrument.

list3 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  "total.seasons + as.factor(cell)", sep="")), data=d)
  return(mod)
})

stargazer(list3, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Naive Tenure Models",
          covariate.labels = c("OAF Tenure"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_tenure.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Naive Tenure Models
Ca Mg pH Total.N Total.C ExAl
(1) (2) (3) (4) (5) (6)
OAF Tenure -0.817 -0.104 0.002 -0.0002 -0.001 -0.006*
(2.856) (0.603) (0.003) (0.0002) (0.003) (0.003)
Constant 556.919 69.700 4.693*** 0.200*** 3.370*** 1.893***
(389.276) (82.243) (0.373) (0.021) (0.392) (0.422)
Observations 2,443 2,443 2,443 2,443 2,443 2,443
R2 0.550 0.581 0.606 0.500 0.455 0.565
Adjusted R2 0.510 0.544 0.570 0.455 0.406 0.527
Residual Std. Error (df = 2241) 389.234 82.235 0.373 0.021 0.392 0.422
F Statistic (df = 201; 2241) 13.648*** 15.480*** 17.114*** 11.128*** 9.300*** 14.510***
Note: p<0.1; p<0.05; p<0.01
Includes FE for cell

Interpretation: The naive One Acre Fund tenure model suggest that across the board that additional years of 1AF practices have a negative effect on soil health parameters. Let’s combine 1AF tenure with the agronomic practices model above to build a more robust model:

list4 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~",  logVars, "+ total.seasons + as.factor(cell)", sep="")), data=d)
  return(mod)
})

stargazer(list4, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Ag Practice and Tenure",
          covariate.labels = c("Seasons of Fertilizer (log)", "Seasons of Compost (log)", "Seasons of Lime (log)", "Seasons of Fallow (log)", "OAF Tenure"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_ag_tenure.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Ag Practice and Tenure
Ca Mg pH Total.N Total.C ExAl
(1) (2) (3) (4) (5) (6)
Seasons of Fertilizer (log) -27.500** -6.689** -0.019 -0.001* -0.011 0.017
(13.945) (2.951) (0.013) (0.001) (0.014) (0.015)
Seasons of Compost (log) 10.127 1.735 0.018 0.001 0.007 -0.022*
(12.169) (2.575) (0.012) (0.001) (0.012) (0.013)
Seasons of Lime (log) 123.055*** 22.614*** 0.066** 0.005*** 0.068** 0.003
(32.649) (6.909) (0.031) (0.002) (0.033) (0.035)
Seasons of Fallow (log) -70.840*** -13.479*** -0.103*** 0.001 0.022 0.091***
(15.501) (3.280) (0.015) (0.001) (0.016) (0.017)
OAF Tenure 1.921 0.653 0.004 -0.0001 -0.001 -0.008**
(3.481) (0.737) (0.003) (0.0002) (0.004) (0.004)
Constant 535.144 65.394 4.660*** 0.198*** 3.357*** 1.933***
(386.869) (81.862) (0.369) (0.021) (0.392) (0.420)
Observations 2,443 2,443 2,443 2,443 2,443 2,443
R2 0.558 0.587 0.616 0.502 0.456 0.572
Adjusted R2 0.517 0.549 0.581 0.457 0.407 0.533
Residual Std. Error (df = 2237) 386.306 81.743 0.368 0.021 0.392 0.419
F Statistic (df = 205; 2237) 13.771*** 15.513*** 17.499*** 11.012*** 9.161*** 14.607***
Note: p<0.1; p<0.05; p<0.01
Includes FE for cell

Interpretation: Including agronomic practices and 1AF tenure in the same model dampens the magnitude, but not the significance, of 1AF tenure on soil health outcomes.

Agronomic practices - 15B cultivation practices

Thus far we have looked at aggregated historical plot level practices and their effect on soil health. We also asked farmers about their cultivation practices on their plot in the previous season, 15B. We have more precise information for fertilizer, compost and liming practices for the 15B season.

# scale all the field application variables to ares
d$ares <- d$field.size/100
d$fert1.are <- d$field_kg_fert1_15b/d$ares
d$fert2.are <- d$field_kg_fert2_15b/d$ares
d$fert.total.are <- apply(d[,c("fert1.are", "fert2.are")], 1, function(x){
  sum(x, na.rm=T)
})

d$fert.total.are <- ifelse(is.na(d$fert1.are) & is.na(d$fert2.are), NA, d$fert.total.are)

d$compost.are <- d$field_kg_compost_15b/d$ares
d$lime.are <- d$kg_lime_15b/d$ares

intensityVars <- c("fert1.are", "fert2.are",
                   "compost.are", "lime.are")

cor(d[,intensityVars], use="complete.obs")
##             fert1.are fert2.are compost.are  lime.are
## fert1.are   1.0000000 0.9889407   0.5216134 0.8679373
## fert2.are   0.9889407 1.0000000   0.5328874 0.8539114
## compost.are 0.5216134 0.5328874   1.0000000 0.6236616
## lime.are    0.8679373 0.8539114   0.6236616 1.0000000
#table(d$field_15b_fert_t, useNA = 'ifany')
d$field_15b_fert_t <- tolower(d$field_15b_fert_t)
names(d)[names(d)=="v69"] <- "field_15b_fert_t2"
#table(d$field_15b_fert_t2)
d$field_15b_fert_t2 <- tolower(d$field_15b_fert_t2)
d$dap1 <- ifelse(d$field_15b_fert_t=="dap", d$field_kg_fert1_15b, NA)
d$dap1.are <- d$dap1/d$ares
d$npk1 <- ifelse(grepl("npk", d$field_15b_fert_t), d$field_kg_fert1_15b, NA)
d$npk1.are <- d$npk1/d$ares
d$urea1 <- ifelse(d$field_15b_fert_t=="urea", d$field_kg_fert1_15b, NA)
d$urea1.are <- d$urea1/d$ares
d$urea2 <- ifelse(d$field_15b_fert_t2=="urea", d$field_kg_fert2_15b, NA)
d$urea2.are <- d$urea2/d$ares
d$total.urea.are <- apply(d[,c("urea1.are", "urea2.are")], 1, function(x){
  sum(x, na.rm=T)
})
d$total.urea.are <- ifelse(is.na(d$urea1.are) & is.na(d$urea2.are), NA, d$total.urea.are)

Graph the application/are variables

for(i in 1:length(intensityVars)){
  print(
  ggplot(d, aes(x=d[,intensityVars[i]])) + geom_density() +
      labs(x = intensityVars[i], title = intensityVars[i])
  )
}
## Warning: Removed 1800 rows containing non-finite values (stat_density).

## Warning: Removed 2233 rows containing non-finite values (stat_density).

## Warning: Removed 143 rows containing non-finite values (stat_density).

## Warning: Removed 2399 rows containing non-finite values (stat_density).

Conclusion - Take 1: The application rate per are variables are weird. I think it’s because of the field dimensions. I’m going to go back to the field dimensions and check this.

Let’s look at the dimensions of the fields that have large application rates

d[d$fert1.are>10 & !is.na(d$fert1.are), c("field_dim1", "field_dim2", "field.size", "fert1.are")]
##      field_dim1 field_dim2 field.size fert1.are
## 828          48         12        576  15.27778
## 837          10         25        250  35.20000
## 936           5          8         40  25.00000
## 946          25         25        625  14.08000
## 1158          4         20         80  12.50000
## 2380          3          5         15  13.33333
d[d$fert2.are>10 & !is.na(d$fert2.are), c("field_dim1", "field_dim2", "field.size", "fert2.are")]
##     field_dim1 field_dim2 field.size fert2.are
## 828         48         12        576  15.27778
## 830          8         35        280  31.42857
## 837         10         25        250  35.20000
## 877         10          5         50  42.00000
d[d$compost.are>500 & !is.na(d$compost.are), c("field_dim1", "field_dim2", "field.size", "compost.are")]
##      field_dim1 field_dim2 field.size compost.are
## 93           13          8        104    769.2308
## 94            5          4         20   1000.0000
## 181           6          6         36    833.3333
## 205          10          5         50   1000.0000
## 383           2         20         40    750.0000
## 764          26          1         26    807.6923
## 885          15          2         30    666.6667
## 936           5          8         40   1250.0000
## 1128         15          6         90   2223.3333
## 1525         12        123       1476   1507.4526
## 1627         33          3         99    606.0606
## 2366          7         40        280   1714.6429
## 2380          3          5         15    666.6667
# there's a field that is 1 meter wide? Surely not.

d[abs(d$lime.are)>40 & !is.na(d$lime.are), c("field_dim1", "field_dim2", "field.size", "lime.are")]
##      field_dim1 field_dim2 field.size lime.are
## 1101         11         12        132 113.6364
## 1119         25         10        250  60.0000
# how is there a negative quantity of lime?

Table 16 - Kgs of soil ammendments - fertilizer (kg)

# this should be kg of fertilizer used in this field. Compost is off the charts. Convert this to compost per sq meter
plm.t16 <- function(x, range){
  
  beta = round(summary(x)$coefficients[range,1],3)
  beta.pval = round(summary(x)$coefficients[range,4],3)
  beta.conv = ifelse(beta.pval < 0.01, "***", ifelse(
    beta.pval < 0.05, "**", ifelse(
      beta.pval < 0.1, "*", "")))
  #beta.pval = round(beta.pval, 3)
  outcome = paste(beta, " (", beta.pval, ")", sep = "")
  outcome = c(outcome, unique(round(summary(x)$coefficients[1,1],3)))
  res = data.frame(outcome, stringsAsFactors = F)
  
  return(res)  
}


previousSeasonfert <- paste("fert1.are", "as.factor(cell)", sep=" + ")

list5 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", previousSeasonfert, sep="")), data=d)
  return(mod)
})

table16 <- do.call(cbind, lapply(list5, function(x){
  plm.t16(x, 2)
}))
colnames(table16) <- soilVars
rownames(table16) <- c("Fertilizer (kg/are)", "constant")

table16 <- table16[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16, file=paste(od, "rwtable16_fert.csv", sep = "/"),
          row.names = T)

Table 16 - all fertilizer used

previousSeasonfertAll <- paste("fert.total.are", "as.factor(cell)", sep=" + ")

list5a <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", previousSeasonfertAll, sep="")), data=d)
  return(mod)
})

table16a <- do.call(cbind, lapply(list5a, function(x){
  plm.t16(x, 2)
}))
colnames(table16a) <- soilVars
rownames(table16a) <- c("Fertilizer (all) (kg/are)", "constant")

table16a <- table16a[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16a, file=paste(od, "rwtable16_fert_all.csv", sep = "/"),
          row.names = T)

Table 16 - Kgs of soil ammendments - compost (kg)

# these objects have the same name as the fertilizer objects for simplicity. Run
# from the top to avoid overwriting issues.
previousSeasoncompost <- paste("compost.are", "as.factor(cell)", sep=" + ")
list5b <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", previousSeasoncompost, sep="")), data=d)
  return(mod)
})

table16b <- do.call(cbind, lapply(list5b, function(x){
  plm.t16(x, 2)
}))
colnames(table16b) <- soilVars
rownames(table16b) <- c("Compost (kg/are)", "constant")

table16b <- table16b[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16b, file=paste(od, "rwtable16_compost.csv", sep = "/"),
          row.names = T)

Table 16 - Kgs of soil ammendments - lime (kg)

# these objects have the same name as the fertilizer objects for simplicity. Run
# from the top to avoid overwriting issues.
previousSeasonlime <- paste("lime.are", "as.factor(cell)", sep=" + ")
list5c <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", previousSeasonlime, sep="")), data=d)
  return(mod)
})

table16c <- do.call(cbind, lapply(list5c, function(x){
  plm.t16(x, 2)
}))
colnames(table16c) <- soilVars
rownames(table16c) <- c("Lime (kg/are)", "constant")

table16c <- table16c[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16c, file=paste(od, "rwtable16_lime.csv", sep = "/"),
          row.names = T)

Table 16 - types of fertilizer - DAP (kg)

previousSeasonDAP <- paste("dap1.are", "as.factor(cell)", sep=" + ")
list5d <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", previousSeasonDAP, sep="")), data=d)
  return(mod)
})

table16d <- do.call(cbind, lapply(list5d, function(x){
  plm.t16(x, 2)
}))
colnames(table16d) <- soilVars
rownames(table16d) <- c("DAP (kg/are)", "constant")

table16d <- table16d[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16d, file=paste(od, "rwtable16_dap.csv", sep = "/"),
          row.names = T)

Table 16 - types of fertilizer - NPK (kg)

previousSeasonNPK <- paste("npk1.are", "as.factor(cell)", sep=" + ")
list5e <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", previousSeasonNPK, sep="")), data=d)
  return(mod)
})

table16e <- do.call(cbind, lapply(list5e, function(x){
  plm.t16(x, 2)
}))
colnames(table16e) <- soilVars
rownames(table16e) <- c("NPK (kg/are)", "constant")

table16e <- table16e[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16e, file=paste(od, "rwtable16_npk.csv", sep = "/"),
          row.names = T)

Table 16 - types of fertilizer - Urea (kg)

previousSeasonUrea <- paste("total.urea.are", "as.factor(cell)", sep=" + ")
list5f <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", previousSeasonUrea, sep="")), data=d)
  return(mod)
})

table16f <- do.call(cbind, lapply(list5f, function(x){
  plm.t16(x, 2)
}))
colnames(table16f) <- soilVars
rownames(table16f) <- c("Urea (kg/are)", "constant")

table16f <- table16f[,c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")]
write.csv(table16f, file=paste(od, "rwtable16_urea.csv", sep = "/"),
          row.names = T)
stargazer(list5, type="html", 
          title = "2016A Rwanda Soil Health Baseline - 15B practices",
          covariate.labels = c("Fertilizer Rate (log)"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_15b_ag.htm", sep="/"))

Farmer fertility perception

Let’s look at farmer perceived fertility as a predictor of soil health. We’ll set ‘same fertility’ as the reference category.

d$fertility_qual <- relevel(as.factor(d$general_field_infocompare_fertil), ref="same")

list6 <- lapply(soilVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", "+ fertility_qual + as.factor(cell)", sep="")), data=d)
  return(mod)
})


stargazer(list6, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility",
          covariate.labels = c("Farmer Opinion - Less Fertile",
                               "Farmer Opinion - More Fertile"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("m3.","", soilVars)),
          notes = "Reference category is same fertility as other fields. Includes FE for cell",
          notes.align = "l",
          omit=c("cell"), out=paste(od, "rw_baseline_fertility_qual.htm", sep="/"))
2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility
Ca Mg pH Total.N Total.C ExAl
(1) (2) (3) (4) (5) (6)
Farmer Opinion - Less Fertile -110.261*** -19.192*** -0.124*** -0.0002 0.003 0.139***
(21.695) (4.593) (0.021) (0.001) (0.022) (0.024)
Farmer Opinion - More Fertile 7.185 1.252 -0.005 -0.001 -0.001 0.001
(23.953) (5.071) (0.023) (0.001) (0.024) (0.026)
Constant 665.546* 88.684 4.822*** 0.200*** 3.364*** 1.742***
(387.433) (82.024) (0.370) (0.021) (0.393) (0.420)
Observations 2,443 2,443 2,443 2,443 2,443 2,443
R2 0.556 0.585 0.612 0.499 0.455 0.572
Adjusted R2 0.516 0.548 0.577 0.454 0.406 0.533
Residual Std. Error (df = 2240) 386.825 81.895 0.370 0.021 0.392 0.419
F Statistic (df = 202; 2240) 13.893*** 15.629*** 17.488*** 11.063*** 9.248*** 14.820***
Note: p<0.1; p<0.05; p<0.01
Reference category is same fertility as other fields. Includes FE for cell

Interpretation: Farmers understand their fields well. Their categorization of which field are more and less fertile corresponds to our quantified measures of soil health. The only features farmers don’t seem to get correct are nitrogen and carbon. The nitrogen and carbon levels are indistinguishable in ‘low fertility’ fields relative to the fields deemed to be the ‘same fertility.’ Reminder: We need to remember that farmers are only evaluting one of their fields thus we are not able to account for the quality of the farmer in assessing his/her fields.

Deeper dive into farmer perception of fertility

  • Look at wet chem values for all soil features by farmer perception
  • Run PCA on all soil attributes to see which principle components predict soil fertility
  • Consider adding additional control variables into model (like Ca as control for pH perception model)
# merge wetChem in with d
names(wetChem)[2:21] <- paste("wet.", names(wetChem)[2:21], sep = "")
d <- left_join(d, wetChem, by="SSN")
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector
wetVars <- names(d)[grep("wet.", names(d))]
for(i in 1:length(wetVars)){
  print(
  ggplot(data=d, aes(x=as.factor(client), y=d[,wetVars[i]])) + 
    geom_boxplot() +
    labs(x="Tubura Farmer", y=wetVars[i], title = paste("RW baseline wet chem - ", wetVars[i], sep = ""))
  )
}
## Warning: Removed 2206 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2206 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

## Warning: Removed 2199 rows containing non-finite values (stat_boxplot).

list7 <- lapply(wetVars, function(x){
  mod <- lm(as.formula(paste("d[,x] ~", "+ fertility_qual + as.factor(cell)", sep="")), data=d)
  return(mod)
})

suppressWarnings(
stargazer(list7, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility (wet chem)",
          covariate.labels = c("Farmer Opinion - Less Fertile",
                               "Farmer Opinion - More Fertile"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("wet.","", wetVars)),
          notes = "Reference category is same fertility as other fields. Includes FE for cell",
          notes.align = "l",
          omit=c("cell"), out=paste(od, "rw_baseline_fertility_qual_wet.htm", sep="/"))
)
2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility (wet chem)
pH EC..S. P K Ca Mg S Na Fe Mn B Cu Zn C.E.C TN C.N Exch..Acidity Acid.Saturation Exch.Al OC
(1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12) (13) (14) (15) (16) (17) (18) (19) (20)
Farmer Opinion - Less Fertile -0.227 1.930 -9.111 -44.922 -23.678 -22.846 2.274 -0.079 22.320 -29.748 -0.064 -0.317 -0.224 0.138 0.004 -0.277 0.343** 9.657** 0.268** 0.153
(0.147) (11.629) (19.670) (28.712) (150.522) (36.856) (2.236) (2.929) (17.023) (21.301) (0.222) (0.337) (0.764) (1.126) (0.008) (0.627) (0.160) (3.686) (0.124) (0.135)
Farmer Opinion - More Fertile -0.109 -7.216 -16.525 -69.198** -187.493 -57.688 2.223 -0.854 -12.790 -10.391 -0.069 -0.278 -1.040 -1.369 -0.016* 0.287 0.125 3.831 0.091 -0.192
(0.174) (13.775) (23.740) (34.653) (181.669) (44.482) (2.699) (3.535) (20.546) (25.708) (0.268) (0.407) (0.922) (1.359) (0.010) (0.757) (0.193) (4.449) (0.150) (0.163)
Constant 6.225*** 127.000*** 8.910 213.000*** 1,745.000*** 435.000*** 15.700** 24.300*** 61.450 194.000*** 0.415 2.670*** 4.415** 15.650*** 0.195*** 11.750*** 0.140 1.100 0.023 2.335***
(0.406) (32.067) (55.599) (81.159) (425.475) (104.178) (6.321) (8.278) (48.119) (60.210) (0.627) (0.953) (2.160) (3.182) (0.023) (1.772) (0.451) (10.419) (0.351) (0.383)
Observations 237 237 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244
R2 0.742 0.580 0.702 0.653 0.760 0.755 0.729 0.705 0.769 0.688 0.525 0.693 0.540 0.752 0.773 0.671 0.775 0.777 0.797 0.731
Adjusted R2 0.425 0.064 0.348 0.240 0.475 0.463 0.407 0.354 0.494 0.318 -0.039 0.329 -0.006 0.457 0.504 0.280 0.506 0.511 0.555 0.412
Residual Std. Error 0.574 (df = 106) 45.349 (df = 106) 78.629 (df = 111) 114.776 (df = 111) 601.713 (df = 111) 147.330 (df = 111) 8.940 (df = 111) 11.707 (df = 111) 68.051 (df = 111) 85.150 (df = 111) 0.887 (df = 111) 1.347 (df = 111) 3.055 (df = 111) 4.501 (df = 111) 0.033 (df = 111) 2.506 (df = 111) 0.638 (df = 111) 14.735 (df = 111) 0.496 (df = 111) 0.541 (df = 111)
F Statistic 2.344*** (df = 130; 106) 1.125 (df = 130; 106) 1.983*** (df = 132; 111) 1.580*** (df = 132; 111) 2.664*** (df = 132; 111) 2.586*** (df = 132; 111) 2.264*** (df = 132; 111) 2.010*** (df = 132; 111) 2.796*** (df = 132; 111) 1.857*** (df = 132; 111) 0.930 (df = 132; 111) 1.902*** (df = 132; 111) 0.988 (df = 132; 111) 2.551*** (df = 132; 111) 2.867*** (df = 132; 111) 1.715*** (df = 132; 111) 2.888*** (df = 132; 111) 2.923*** (df = 132; 111) 3.293*** (df = 132; 111) 2.290*** (df = 132; 111)
Note: p<0.1; p<0.05; p<0.01
Reference category is same fertility as other fields. Includes FE for cell

Interpretation: Our sample size decreases considerably when we look only at wet chemistry results. Thus, we do not see the same signficant relationships we saw between farmer perceived fertility and soil characteristics we saw when looking at the predicted values.

Principal Component Analysis (predicted values)

pca1 <- prcomp(d[,soilVars], scale=TRUE, center=TRUE)
print(pca1)
## Standard deviations:
## [1] 1.8130261 1.4406298 0.5220816 0.4155603 0.3485684 0.2660126
## 
## Rotation:
##                PC1         PC2         PC3         PC4         PC5
## m3.Ca    0.4639924 -0.32390991  0.05174966 -0.49857472  0.10898630
## m3.Mg    0.4680317 -0.22493445  0.73859014  0.35920491  0.02360238
## pH       0.5252283  0.01946359 -0.31205314 -0.40917507 -0.29772280
## Total.N -0.1011532 -0.65539834 -0.27462802  0.04606937  0.62468884
## Total.C -0.1777593 -0.62959112 -0.12420105  0.20482827 -0.71038388
## ExAl    -0.4979581 -0.13481869  0.51340332 -0.64101002 -0.06359753
##                 PC6
## m3.Ca    0.64549254
## m3.Mg   -0.23505354
## pH      -0.60853607
## Total.N -0.30404269
## Total.C  0.09995442
## ExAl    -0.23524504
plot(pca1)

pca1$rotation
##                PC1         PC2         PC3         PC4         PC5
## m3.Ca    0.4639924 -0.32390991  0.05174966 -0.49857472  0.10898630
## m3.Mg    0.4680317 -0.22493445  0.73859014  0.35920491  0.02360238
## pH       0.5252283  0.01946359 -0.31205314 -0.40917507 -0.29772280
## Total.N -0.1011532 -0.65539834 -0.27462802  0.04606937  0.62468884
## Total.C -0.1777593 -0.62959112 -0.12420105  0.20482827 -0.71038388
## ExAl    -0.4979581 -0.13481869  0.51340332 -0.64101002 -0.06359753
##                 PC6
## m3.Ca    0.64549254
## m3.Mg   -0.23505354
## pH      -0.60853607
## Total.N -0.30404269
## Total.C  0.09995442
## ExAl    -0.23524504

The first principal component is composed primarily of soil pH related variables, Ca, Mg, and pH. The second capture N and C. These groupings (pH grouping and C and N) are not all that surprising given that our predicted soil variable set is fairly limited.

If the variables have the same sign that indicates that they are positively correlated in the principal component. In the first principal component, we see Ca, Mg and pH loading in the same direction. In the second principal component, we see N and C moving in the same direction. Ca and Mg are also associated in the same direction but to a lesser extent.

ggplot(as.data.frame(pca1$x),aes(x=PC1,y=PC2, color=d$fertility_qual)) + geom_point() + 
  labs(title = "PCA of soil attributes and farmer perceived soil fertility",
       x = "First PCA", y= "Second PCA", color="Field Quality")

When we layer farmer perceived soil fertility on top of the principal component scatter plot, no clear pattern emergres. Fields with the same fertilitiy, less and more fertility are indistinguishable by the principal components. Thus, there is not a clear profile for farmer identified healthier soil or weaker soil based on principal components.

Geographic Descriptors - PCA for soil profiles

This section will build on the principal component work above and look at improving understanding of local context to inform local adaptation. Here we will also construct soil profiles to simplify the scaling of promising products and practices to targeted locations. We don’t have a full suite of predictors so we can’t look at a comprehensive soil profile.

ggplot(as.data.frame(pca1$x),aes(x=PC1,y=PC2, color=d$district)) + geom_point() + 
  labs(title = "PCA of soil attributes and district",
       x = "First PCA", y= "Second PCA", color="District")

When we color the figure by district, we start to see a pattern emerge. However, there are too many points to clearly detect where all districts fall. Let’s instead look at the data by AEZ.

ggplot(as.data.frame(pca1$x),aes(x=PC1,y=PC2, color=d$aez)) + geom_point() + 
  labs(title = "PCA of soil attributes and AEZ",
       x = "First PCA - Ca/Mg/pH", y= "Second PCA - N and C", color="AEZ")

Coloring the points by AEZ, we see a much clearer trend. The east is to the right of our graphic, then the central plateau followed by lake Kive and then Congo Nile in green on the left. What does this mean in terms of actual soil features? Let’s look at the figure but with the variable associations layered on top. See here for documentation

library(pca3d)
pca2d( pca1, biplot= TRUE, shape= 19, col= "black"  )

It appears that the right side of the graph, the eastern AEZ, is associated with pH, Mg and Ca. This indicates that as the first principal component increases, so does the level of pH, Ca and Mg. Conversely, total N and C increase as the second principal component decreases. In terms of the AEZ figure above, this suggest that the Congo Nile AEZ has higher levels of N and C. Let’s test these hypotheses with a simple summary table:

print(kable(aggregate(d[,soilVars], by=list(d$aez), function(x){
  round(mean(x, na.rm=T),5)
})))
Group.1 m3.Ca m3.Mg pH Total.N Total.C ExAl
Central Plateau 850.4809 192.4942 5.64775 0.14560 1.90045 0.40840
Congo Nile 479.9955 124.0777 5.02279 0.16663 2.37550 1.17301
East 1369.4056 276.0773 6.01196 0.16728 2.13170 0.18844
Lake Kivu 798.7319 236.6825 5.47968 0.14884 2.05310 0.55349

Confirmed! This likely also suggests that Congo Nile is at a higher altitude than the surrounding areas and that eastern Rwanda has relatively less weathered soils compared to western.

Consequence for trial placement: The Rwanda program is already blocking trials by AEZ but these data confirm that AEZ reflects meaningfu soil variation and thus captures key growing conditions for Rwandan farmers. Blocking trials by AEZ in Rwanda will enable us to evaluate trial hypotheses in more neutral pH ranges and higher N and C conditions.

Principal Component Analysis (wet chemistry values)

wetVal <- d[complete.cases(d[,wetVars]),]

pca2 <- prcomp(wetVal[,c("wet.C.E.C", "wet.pH", "wet.Mg", "wet.Ca")], center=TRUE, scale=TRUE)

pca2 <- prcomp(wetVal[, wetVars], center=TRUE, scale=TRUE)
#plot(pca2)
ggplot(as.data.frame(pca2$x),aes(x=PC1,y=PC2, color=as.factor(wetVal$aez))) + geom_point() +
  labs(title = "Wet Chem PCA with AEZ", x = "First PC", y="Second PC",
       color="AEZ")

#pca2$rotation
pca2d(pca2, biplot= TRUE, shape= 19, col= "black")

# put pca2$x PC1 in the main data to run the fertility perception model.
#mod8 <- lm(
suppressWarnings(
stargazer(list7, type="html", 
          title = "2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility (wet chem)",
          covariate.labels = c("Farmer Opinion - Less Fertile",
                               "Farmer Opinion - More Fertile"),
          dep.var.caption = "",
          dep.var.labels = "",
          column.labels = c(gsub("wet.","", wetVars)),
          notes = "Reference category is same fertility as other fields. Includes FE for cell",
          notes.align = "l",
          omit=c("cell"), out=paste(od, "rw_baseline_fertility_qual_wet.htm", sep="/"))
)
2016A Rwanda Soil Health Baseline - Farmer Perceived Fertility (wet chem)
pH EC..S. P K Ca Mg S Na Fe Mn B Cu Zn C.E.C TN C.N Exch..Acidity Acid.Saturation Exch.Al OC
(1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12) (13) (14) (15) (16) (17) (18) (19) (20)
Farmer Opinion - Less Fertile -0.227 1.930 -9.111 -44.922 -23.678 -22.846 2.274 -0.079 22.320 -29.748 -0.064 -0.317 -0.224 0.138 0.004 -0.277 0.343** 9.657** 0.268** 0.153
(0.147) (11.629) (19.670) (28.712) (150.522) (36.856) (2.236) (2.929) (17.023) (21.301) (0.222) (0.337) (0.764) (1.126) (0.008) (0.627) (0.160) (3.686) (0.124) (0.135)
Farmer Opinion - More Fertile -0.109 -7.216 -16.525 -69.198** -187.493 -57.688 2.223 -0.854 -12.790 -10.391 -0.069 -0.278 -1.040 -1.369 -0.016* 0.287 0.125 3.831 0.091 -0.192
(0.174) (13.775) (23.740) (34.653) (181.669) (44.482) (2.699) (3.535) (20.546) (25.708) (0.268) (0.407) (0.922) (1.359) (0.010) (0.757) (0.193) (4.449) (0.150) (0.163)
Constant 6.225*** 127.000*** 8.910 213.000*** 1,745.000*** 435.000*** 15.700** 24.300*** 61.450 194.000*** 0.415 2.670*** 4.415** 15.650*** 0.195*** 11.750*** 0.140 1.100 0.023 2.335***
(0.406) (32.067) (55.599) (81.159) (425.475) (104.178) (6.321) (8.278) (48.119) (60.210) (0.627) (0.953) (2.160) (3.182) (0.023) (1.772) (0.451) (10.419) (0.351) (0.383)
Observations 237 237 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244 244
R2 0.742 0.580 0.702 0.653 0.760 0.755 0.729 0.705 0.769 0.688 0.525 0.693 0.540 0.752 0.773 0.671 0.775 0.777 0.797 0.731
Adjusted R2 0.425 0.064 0.348 0.240 0.475 0.463 0.407 0.354 0.494 0.318 -0.039 0.329 -0.006 0.457 0.504 0.280 0.506 0.511 0.555 0.412
Residual Std. Error 0.574 (df = 106) 45.349 (df = 106) 78.629 (df = 111) 114.776 (df = 111) 601.713 (df = 111) 147.330 (df = 111) 8.940 (df = 111) 11.707 (df = 111) 68.051 (df = 111) 85.150 (df = 111) 0.887 (df = 111) 1.347 (df = 111) 3.055 (df = 111) 4.501 (df = 111) 0.033 (df = 111) 2.506 (df = 111) 0.638 (df = 111) 14.735 (df = 111) 0.496 (df = 111) 0.541 (df = 111)
F Statistic 2.344*** (df = 130; 106) 1.125 (df = 130; 106) 1.983*** (df = 132; 111) 1.580*** (df = 132; 111) 2.664*** (df = 132; 111) 2.586*** (df = 132; 111) 2.264*** (df = 132; 111) 2.010*** (df = 132; 111) 2.796*** (df = 132; 111) 1.857*** (df = 132; 111) 0.930 (df = 132; 111) 1.902*** (df = 132; 111) 0.988 (df = 132; 111) 2.551*** (df = 132; 111) 2.867*** (df = 132; 111) 1.715*** (df = 132; 111) 2.888*** (df = 132; 111) 2.923*** (df = 132; 111) 3.293*** (df = 132; 111) 2.290*** (df = 132; 111)
Note: p<0.1; p<0.05; p<0.01
Reference category is same fertility as other fields. Includes FE for cell

District and cell level summaries of soil and managment practices

dist.sum <- aggregate(d[,out.list], by=list(d$district), function(x){
  return(c(
    paste(
      round(mean(x, na.rm=T),3), " (", round(median(x, na.rm=T),3), ")", 
      " (", round(sd(x, na.rm=T),2), ")", sep = ""
    )
    ))
})
write.csv(dist.sum, file=paste(od, "district covariate summary.csv", sep = "/"))
cell.sum <- aggregate(d[,out.list], by=list(d$cell), function(x){
  return(c(
    paste(
      round(mean(x, na.rm=T),3), " (", round(median(x, na.rm=T),3), ")", 
      " (", round(sd(x, na.rm=T),2), ")", sep = ""
    )
    ))
})
write.csv(cell.sum, file=paste(od, "cell covariate summary.csv", sep = "/"))

Female farmers farming poorer soils?

remove <- "female"
genderBalance <- out.list[!out.list %in% remove]

equal <- do.call(rbind, lapply(genderBalance, function(x){
    
    out <- t.test(d[,x] ~ d[,"female"], data=d)
    tab <- data.frame(out[[5]][[2]],out[[5]][[1]], out[3])
    tab[,1:2] <- round(tab[,1:2],3)
    names(tab) <- c(names(out[[5]]), "pvalue")
    #tab[,3] <- p.adjust(tab[,3], method="holm")
    #tab[,3] <- ifelse(tab[,3] < 0.001, "< 0.001", round(tab[,3],3))
    #print(tab)
    return(tab)
  
}))

rownames(equal) <- NULL

# order variables 
equal$pvalue <- p.adjust(equal$pvalue, method="fdr")
rownames(equal) <- genderBalance
equal <- equal[order(equal$pvalue),]

equal$pvalue <- ifelse(equal$pvalue < 0.001, "< 0.001", round(equal$pvalue,3))
colnames(equal) <- c("Male Farmers","Female Farmers", "p-value")    
write.csv(equal, file=paste(od, "female farmer status.csv", sep = "/"), row.names=T)

Propensity Score Matching

We need to do a more rigorous job of accounting for differences between Tubura farmers and identified control farmers. Execute propensity score matching (PSM) to identify control farmers that overlap with Tubura farmers with regard to their likelihood of being a Tubura farmer.

psmVars <- paste(c("female", "age", "hhsize", "total.seasons",
                   "cows", "goats", "chickens", "pigs", "sheep"),
                   collapse=" + ")

reg <- glm(as.formula(paste("client ~", psmVars, sep="")), 
           family= binomial(link="logit"), data=d)
summary(reg)    
## 
## Call:
## glm(formula = as.formula(paste("client ~", psmVars, sep = "")), 
##     family = binomial(link = "logit"), data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5522  -0.8456   0.0376   0.9250   2.0863  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.408940   0.203191   2.013   0.0442 *  
## female        -0.749686   0.097393  -7.698 1.39e-14 ***
## age           -0.024394   0.003378  -7.221 5.16e-13 ***
## hhsize         0.037275   0.022980   1.622   0.1048    
## total.seasons  0.530099   0.028803  18.404  < 2e-16 ***
## cows           0.007891   0.017934   0.440   0.6599    
## goats         -0.004426   0.030020  -0.147   0.8828    
## chickens       0.023011   0.017794   1.293   0.1959    
## pigs           0.052035   0.057838   0.900   0.3683    
## sheep          0.053211   0.070949   0.750   0.4533    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3386.7  on 2442  degrees of freedom
## Residual deviance: 2566.6  on 2433  degrees of freedom
## AIC: 2586.6
## 
## Number of Fisher Scoring iterations: 5
# summarize predicted probabilities
pr <- data.frame(pr_score = predict(reg, type='response'), treat = d$client)

# graph
psmGraph <- ggplot() + geom_histogram(data=subset(pr, pr$treat==1), aes(x = pr_score, y=..count.., fill=as.factor(treat)), bins=80, position = "identity") +
    geom_histogram(data=subset(pr, pr$treat==0), aes(x=pr_score, y=-..count.., fill=as.factor(treat)), bins=80, position = "identity") +
    scale_y_continuous(limits=c(-150,150)) + 
  labs(title ="PSM score overlap", x = "PSM score", y="Farmer count",
       fill="Tubura/Control")

print(psmGraph)

pdf(file=paste(od, "rw_baseline_psm_overlap.pdf", sep="/"), height=8.5, width=11)
print(psmGraph)
dev.off()
## quartz_off_screen 
##                 2

Interpretation We have some overlap but it’s clear that Tubura farmers occupy a different range than the identified control farmers. Let’s continue with the PSM matching process but restrict ourselves to Tubura and control farmers that meet a certain PSM matching radius.

Notes on PSM process:

We have to indicate a variable for matching. I’m choosing pH as we know it to be an issue in most of our operating areas and addressing soil acidity has numerous residual benefits to soil health. I’ll want to do this for all soil outcomes, however.

  • As a next step, I’d like to identify the matched subset and then regress Tubura tenure on soil outcomes. Does that make sense?
  • I chose a caliper of 0.25. I should review this process with Maya to make certain I’m following best practice. I sort of just pulled that sd figure out of the sky.
  • An initial check of the match quality using a common model indicates that the matches are poor.
# PSM prep
tr <- cbind(d$client)
x <- d[, unlist(strsplit(psmVars, " + ", fixed=T))]
y <- soilVars


# PSM
set.seed(20161102)
m <- lapply(y, function(response){

  suppressWarnings(
  mod <- Match(Y = d[,response], Tr = tr, X = reg$fitted, ties=FALSE, replace=FALSE, caliper=0.25, estimand = "ATE")    
  )
  matchRes <- MatchBalance(tr ~ d[,response], match.out=mod, nboots=500, data=d, print.level = 0)
  return(list(mod, matchRes))
})
#lapply(m, summary)

Now check the naive model approach for PSM balance.

matchRes <- do.call(rbind, lapply(1:length(m), function(model){
  val <- as.data.frame(cbind(
    standard.diff=m[[model]][[2]]$AfterMatching[[1]]$sdiff, 
    var.ratio = m[[model]][[2]]$AfterMatching[[1]]$var.ratio,
    sdiff.adj = m[[model]][[2]]$AfterMatching[[1]]$sdiff/100))
  return(val)
}))
rownames(matchRes) <- y
print(kable(matchRes))
standard.diff var.ratio sdiff.adj
m3.Ca -1.5998283 0.8574354 -0.0159983
m3.Mg -12.0950035 0.7265264 -0.1209500
pH 6.4227217 1.0777240 0.0642272
Total.N -2.1315722 0.9573257 -0.0213157
Total.C -2.1472466 1.0490227 -0.0214725
ExAl -0.8045446 1.0180995 -0.0080454

Interpretation: We want to see standard mean differences less than the absolute value of 0.25 (or 0.1 if we’re being conservative) and variance ratios close to 1 but certainly between 0.5 and 2.

According to the CRAN summary, sdiff is the standardized difference between the treatment and control units multiplied by 100. If I divide by 100, the values come much closer to reasonable value.

Let the PSM models vary

The common model approach doesn’t seem to be working for any of the variables. I’m going to rework the modeling approach so we can fit different models for each outcome upon which we’re trying to match.

d$age2 <- d$age^2
d$hhsize_age <- d$hhsize*d$age
d$hhsize2 <- d$hhsize^2

coreVars = c("female", "age", "hhsize", "own", "as.factor(cell)", "cows", "goats", "chickens", "pigs", "sheep")

psmList <- list(
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="m3.Ca"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="m3.Mg"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="pH"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="Total.N"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="Total.C"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="ExAl"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="n_season_fert"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="n_season_compost"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="n_seasons_leg_1"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="n_season_fallow"),
  list(tr = "client",
       psmVars = paste(coreVars,
                   collapse=" + "),
       y="logFert"),
  list(tr = "client",
       psmVars = paste(c(coreVars, "age2",
                   "hhsize2"),
                   collapse=" + "),
       y="n_seasons_leg_2"),
  list(tr = "client",
       psmVars = paste(c(coreVars, "age2",
                   "hhsize2"),
                   collapse=" + "),
       y="n_season_lime"),
  list(tr = "client",
       psmVars = paste(c("female", "age", "hhsize", "own",
                         "cows", "goats", "chickens", "pigs", "sheep",
                         "as.factor(district)", "age2", "hhsize2",
                         "hhsize_age"),
                   collapse=" + "),
       y="fert1.are"),
  # list(tr = "client",
  #      psmVars = paste(c("female", "age", "hhsize","as.factor(district)",
  #                  "cows", "goats", "chickens", "pigs", "sheep","age2", "hhsize2",
  #                        "hhsize_age"),
  #                  collapse=" + "),
  #      y="fert2.are"),
  list(tr = "client",
       psmVars = paste(c("female", "age", "hhsize","as.factor(district)",
                   "cows", "goats", "chickens", "pigs", "sheep","age2", "hhsize2",
                         "hhsize_age"),
                   collapse=" + "),
       y="compost.are")
  
)

# fertilizer application in 15B has missing values at the cell level so i include
# a district level control instead. We don't get a good fit with the current model.
# adjust!
# aggregate(d$fert1.are, by=list(d$client, d$cell), FUN=mean, na.rm=T)
# lime.are is also being finicky



# PSM
set.seed(20161102)
m <- lapply(psmList, function(listInput){

  naCheck <- unlist(strsplit(gsub("\\+", "", as.vector(listInput$psmVars))," ", fixed=TRUE))
  naCheck <- naCheck[-which(naCheck=="")]
  
  # keep complete cases of outcome variable
  k <- d[complete.cases(d[,listInput$y]),]
  k <- k[complete.cases(k[,listInput$y]),]
  
  # run glm regression:
  reg <- glm(as.formula(paste(listInput$tr, "~", listInput$psmVars, sep="")),  family= binomial(link="logit"), data=k)
  
  suppressWarnings(
  mod <- Match(Y = k[,listInput$y], Tr = k[,listInput$tr], X = reg$fitted, ties=FALSE, replace=FALSE, caliper=0.25, estimand = "ATE")   
  )
  matchRes <- MatchBalance(k[,listInput$tr] ~ k[,listInput$y], match.out=mod, nboots=500, data=k, print.level = 0)
  #print(listInput$y)
  return(list(mod, matchRes))
  
})

The models can now vary by outcome. Let’s see if we can improve our results.

matchRes <- do.call(rbind, lapply(1:length(m), function(model){
  val <- as.data.frame(cbind(
    standard.diff=m[[model]][[2]]$AfterMatching[[1]]$sdiff, 
    var.ratio = m[[model]][[2]]$AfterMatching[[1]]$var.ratio,
    sdiff.adj = m[[model]][[2]]$AfterMatching[[1]]$sdiff/100))
  return(val)
}))

namesInput <- NULL
for(i in 1:length(psmList)){
  namesInput[i] <- psmList[[i]]$y
}
rownames(matchRes) <- namesInput
print(kable(matchRes))
standard.diff var.ratio sdiff.adj
m3.Ca -4.7500746 0.9005991 -0.0475007
m3.Mg -6.3887895 0.9215206 -0.0638879
pH -2.9860193 1.0181086 -0.0298602
Total.N -6.0596975 0.9986588 -0.0605970
Total.C -2.3798262 1.0594167 -0.0237983
ExAl 4.1895526 1.0408160 0.0418955
n_season_fert 78.6639739 3.3582015 0.7866397
n_season_compost 9.4102562 0.9632805 0.0941026
n_seasons_leg_1 5.4860055 0.8773639 0.0548601
n_season_fallow 6.2184929 1.2947263 0.0621849
logFert 102.5919067 1.7352434 1.0259191
n_seasons_leg_2 -12.0195420 0.9363917 -0.1201954
n_season_lime 14.0155855 1.5306613 0.1401559
fert1.are 0.5627194 1.3655780 0.0056272
compost.are 8.5920038 1.3089388 0.0859200

Interpretation If I divide the standardized mean differences by 100, we meet the balance criteria of the standardized mean difference being close to 0 and the variance being close to 1. Let’s print out the model results to see how Tubura and control farmers compare on key soil meterics. These results should supercede the naive balance tables presented above.

We achieve acceptable balance for the soil attributes but we don’t for seasons of fertilizer use. This is isn’t entirely unexpected given that Tubura’s primary service is providing fertilizer inputs and training.

coefTable <- do.call(rbind, lapply(1:length(m), function(model){
  beta = round(m[[model]][[1]]$est.noadj,3)
  mean.Tr = round(m[[model]][[2]]$AfterMatching[[1]][[3]], 2)
  mean.Co = round(m[[model]][[2]]$AfterMatching[[1]][[4]], 2)
  pval = m[[model]][[2]]$AfterMatching[[1]][[10]][[3]] # p.value
  #pval = (1 - pnorm(abs(m[[model]][[1]]$est/m[[model]][[1]]$se.standard))) * 2
  pval = ifelse(pval < 0.001, "0.001", round(pval, 3))
  
  res = data.frame(beta, mean.Tr, mean.Co, pval)
  return(res)
}))
row.names(coefTable) <- namesInput
coefTable$pval.adj <- round(p.adjust(coefTable$pval, method="fdr"),3)
print(kable(coefTable))
beta mean.Tr mean.Co pval pval.adj
m3.Ca -25.622 854.75 880.38 0.156 0.213
m3.Mg -7.764 204.41 212.18 0.056 0.105
pH -0.017 5.52 5.54 0.346 0.399
Total.N -0.002 0.16 0.16 0.065 0.108
Total.C -0.012 2.11 2.12 0.458 0.491
ExAl 0.026 0.60 0.58 0.189 0.236
n_season_fert 2.547 3.40 0.85 0.001 0.004
n_season_compost 0.354 5.96 5.60 0.004 0.012
n_seasons_leg_1 0.145 2.25 2.11 0.096 0.144
n_season_fallow 0.105 0.67 0.56 0.038 0.081
logFert 0.821 1.19 0.37 0.001 0.004
n_seasons_leg_2 -0.384 2.64 3.02 0.001 0.004
n_season_lime 0.096 0.23 0.13 0.001 0.004
fert1.are 0.009 1.06 1.06 0.947 0.947
compost.are 9.912 54.14 44.23 0.006 0.015

Calculate farmers under soil health thresholds - table 1

thresh <- d %>% group_by(client) %>% dplyr::summarize(
  count = n(),
  ph = sum(pH<5.8),
  carbon = sum(Total.C < 2),
  nitrogen = sum(Total.N < 0.1),
  calcium = sum(m3.Ca < 720),
  magnesium = sum(m3.Mg < 100)
  #aluminum = sum(ExAl)
) %>% mutate(
  under.ph = paste(paste(round(ph/count,4)*100, "%", sep=""), " (", ph, ")", sep=""),
  under.carbon = paste(paste(round(carbon/count,4)*100,"%", sep=""), " (", carbon, ")", sep=""),
  under.nitrogen = paste(paste(round(nitrogen/count,4)*100, "%", sep=""), " (", nitrogen, ")", sep=""),
  under.calcium = paste(paste(round(calcium/count,4)*100, "%", sep=""), " (", calcium, ")", sep=""),
  under.mag = paste(paste(round(magnesium/count,4)*100,"%", sep=""), " (", magnesium, ")", sep="")
) %>% as.data.frame() 

thresh <- thresh[, c("client", names(thresh)[grep("under", names(thresh))])]
thresh <- t(thresh)
colnames(thresh) = thresh[1, ] # the first row will be the header
colnames(thresh) = c("non-client", "client")
thresh = thresh[-1, ]

write.csv(thresh, file=paste(od, "table1_rw_thresholds.csv", sep = "/"), row.names = T)

Table 1:3 coefficients

#
write.csv(coefTable, file=paste(od, "psm coefficients.csv", sep = "/"),
          row.names = T)

# sort by the order Eric wants
t1order <- c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl")
table1vars <- paste(t1order, collapse = "|")
table1 <- coefTable[grep(table1vars, rownames(coefTable)), ]
table1 <- table1[order(match(rownames(table1), t1order)),]

write.csv(table1, file=paste(od, "psm coefficients ordered for ES.csv", sep = "/"))

# 11/17 added lime
t2order <- c("n_season_fert", "n_season_compost", "n_season_lime", "n_season_fallow", "n_seasons_leg_1", "n_seasons_leg_2")
table2vars <- paste(t2order, collapse = "|")
table2 <- coefTable[grep(table2vars, rownames(coefTable)), ]
table2 <- table2[order(match(rownames(table2), t2order)),]

write.csv(table2, file=paste(od, "psm coefficients ordered for ES_agprac.csv", sep = "/"))

#table 3
t3order <- c("fert1.are", "compost.are")
table3vars <- paste(t3order, collapse = "|")
table3 <- coefTable[grep(table3vars, rownames(coefTable)), ]
table3 <- table3[order(match(rownames(table3), t3order)),]
write.csv(table3, file=paste(od, "psm coefficients table3.csv", sep = "/"))

Interpretation: Propensity score matching gives us a comparable treatment and control group. The table above shows that after matching on those characteristics, there are effectively no differences between One Acre Fund farmer and Tubura farmers on soil attributes. The unadjusted p-values show 1AF farmers to have slow levels of soil nitrogen but this finding disappears if we account for running multiple matching models.

When we expand the outcome variable set to include practice variables, we first no longer get a good propensity score match for all variables.

  • Tubura participation so strongly overlaps with fertilizer use that the comparison group is no longer comparable. The resulting model shows a significant difference in seasons of fertilizer use.
  • Tubura farmers have used compost for 0.3 more seasons than non-Tubura farmers. We don’t observe a difference in N or C levels in the soil attributes however.
  • We don’t see a difference between Tubura and non-Tubura farmers on seasons of legumes being the primary crop in the study plot.

Study group balance post-match

See here for some guidance on hwo to use weights to reconstruct the group balance following the matches.

See here for weighted t.test documentation

suppressMessages(library(weights))
tableVars <- c("age", "female", "hhsize", "own")

postMatch <- do.call(rbind, lapply(1:length(m), function(model){
  
  innerPost <- do.call(rbind, lapply(tableVars, function(x){
  
  mean.t = weighted.mean(d[m[[model]][[1]]$index.treated,][,x], m[[model]][[1]]$weights)
  mean.c = weighted.mean(d[m[[model]][[1]]$index.control,][,x], m[[model]][[1]]$weights)
  
  # combined data
  dm <- as.data.frame(rbind(d[m[[model]][[1]]$index.treated,], 
              d[m[[model]][[1]]$index.control,]))
  
  test = wtd.t.test(d[m[[model]][[1]]$index.treated,][,x], 
                    d[m[[model]][[1]]$index.control,][,x],
                    weight=m[[model]][[1]]$weights,
                    samedata=TRUE)
  
  return(data.frame(model.num = model, 
                    outcome=x, 
                    tr=mean.t, 
                    contr = mean.c,
                    pval = test$coefficients[3][[1]]))
  }))
  
  return(innerPost)
}))
write.csv(postMatch, file=paste(od, "rw post match covars.csv", sep = "/"),
          row.names = F)

Models with PSM matched farmers

Per Robert’s suggestion, now that we’ve matched Tubura and non-Tubura farmers, let’s assess the severity of Tubura tenure on key soil health outcomes.

tenureTab.add <- lapply(1:length(m), function(model){
  
  dm <- as.data.frame(rbind(d[m[[model]][[1]]$index.treated,], 
              d[m[[model]][[1]]$index.control,]))
  dm$client_tenure <- dm$client*dm$total.seasons
  mod <- lm(as.formula(paste("dm[,psmList[[model]]$y] ~", "total.seasons + as.factor(cell)", sep ="")), data=dm)
  return(mod)
})

# add tenured>=3 to d if we want to run this as a binary comparison

# tenureTab.binary <- lapply(1:length(m), function(model){
#   
#   dm <- as.data.frame(rbind(d[m[[model]][[1]]$index.treated,], 
#               d[m[[model]][[1]]$index.control,]))
#   dm$client_tenure <- dm$client*dm$total.seasons
#   mod <- lm(as.formula(paste("dm[,psmList[[model]]$y] ~", "tenured + as.factor(cell)", sep ="")), data=dm)
#   return(mod)
# })
# modNames <- c("Calcium", "Magnesium", "pH", "Nitrogen", "Carbon", "Seasons fertilizer", "Seasons compost", "Seasons legumes", "Seasons fallow", "Fertilizer (log)", "Season Sec. legumes", "Seasons Lime", "Fertilizer/Are", "Compost/are")

modNames <- unlist(lapply(psmList, function(x){
  return(x$y)
}))
suppressWarnings(
stargazer(tenureTab.add, type="html", 
          title = "2016A Rwanda Soil Health Baseline - PSM Tenure",
          covariate.labels = "One Acre Fund Tenure", 
          dep.var.labels = "",
          #column.labels = modNames,
          #column.labels = c(gsub("m3.", "", as.vector(namesInput))),
          notes = "Includes FE for cell",
          omit=c("cell"), out=paste(od, "rw_baseline_matched_tenure.htm", sep="/"))
)
plm.tenure <- function(x){
  
  intercept = x$coefficients[[1]]
  beta = round(x$coefficients[[2]],3)
  int.pval = summary(x)$coefficients[1,4]
  int.pval = ifelse(int.pval < 0.001, "< 0.001", round(as.numeric(int.pval),3))
  beta.pval = summary(x)$coefficients[2,4]
  beta.pval = ifelse(beta.pval < 0.001, "< 0.001", round(as.numeric(beta.pval),3))
  res = data.frame(intercept, int.pval, beta, beta.pval, stringsAsFactors = F)
  
  return(res)  
}

tenure.reg <- do.call(rbind, lapply(tenureTab.add, function(x){
  plm.tenure(x)
}))

rownames(tenure.reg) <- modNames
t6order <- c("cows", "pigs", "sheep", "goats", "chickens")
table6vars <- paste(t6order, collapse = "|")
rw.table6 <- output[grep(table6vars, rownames(output)), ]
rw.table6 <- rw.table6[order(match(rownames(rw.table6), t4order)),]

ESorder <- c("pH", "Total.C", "Total.N", "m3.Ca", "m3.Mg", "ExAl", "n_season_fert",
             "logFert", "n_season_compost", "n_seasons_leg_1", "n_season_fallow")
ESorder.grep <- paste(ESorder, collapse = "|")
tenure.reg <- tenure.reg[grep(ESorder.grep,rownames(tenure.reg)),]
tenure.reg <- tenure.reg[order(match(row.names(tenure.reg), ESorder)),]

write.csv(tenure.reg, file=paste(od, "table13_reg.csv", sep = "/"), row.names=T)

Interpretation: Using a PSM matched sample, the models above assess the effects of additional years of farming with Tubura. Numerous control farmers have also been Tubura farmers in previous seasons. Thus, I’m keeping the model simple instead of adding a client*tenure interaction. We can easily test that as well though.

Soil Clusters

Heirarchical Clustering

I draw from Robert’s example in the BGMS soil response notebook.

Scale soil variables to remove units as an issue.

scaledSoil <- scale(d[,soilVars])
clustMethods <- c("ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid")
invisible(sapply(clustMethods, FUN=function(method) {
    plot(hclust(dist(scaledSoil), method=method), main=method, ylab="", label=FALSE)
}))

Note: I’m honestly not certain what we’re looking for in these outputs. Look for a balanced tree? Ward looks pretty balanced.

hc <- hclust(dist(scaledSoil), method="ward.D2")
d$hc4 <- cutree(hc, k=4)
hcCentroids <- aggregate(list(d[,soilVars]), by=list(cluster=d$hc4), FUN=mean)
hcTable <- rbind(t(table(d$hc4)),
      t(round(hcCentroids, 2)))
print(kable(hcTable))
1 2 3 4
861.00 443.00 922.00 217.00
cluster 1.00 2.00 3.00 4.00
m3.Ca 665.28 380.73 1013.54 2151.18
m3.Mg 169.97 89.04 247.90 434.78
pH 5.28 4.81 5.98 6.22
Total.N 0.16 0.18 0.14 0.19
Total.C 2.11 2.68 1.74 2.59
ExAl 0.64 1.61 0.14 0.11

K-means clustering

K-means clustering is a typical unsupervised learning algorithm. We start by identifying the number of clusters we should tell the formula to look for.

numClusters <- 2:10
plot(numClusters,
     sapply(numClusters, FUN=function(k) { sum(kmeans(scaledSoil, centers=k, nstart=20, iter.max=20)$withinss) }),
     type="b", ylab="average within cluster sum of squared error")

We’re looking for the bend in the graph. It seems to come at 3 but we can try both 3 and 4 and see how they match with our understanding of AEZ and environmental conditions.

set.seed(20161125)
clusters <- 4
d$km4 <- kmeans(scaledSoil, centers=clusters, nstart=20, iter.max=20)$cluster
kmCentroids <- aggregate(list(d[,soilVars]), by=list(cluster=d$km4), FUN=mean)
kmTable <- rbind(t(table(d$km4)),
      t(round(kmCentroids, 2)))
print(kable(kmTable))
1 2 3 4
823.00 471.00 673.00 476.00
cluster 1.00 2.00 3.00 4.00
m3.Ca 930.87 1755.64 535.91 397.22
m3.Mg 233.95 371.23 143.52 93.95
pH 5.93 6.12 5.18 4.83
Total.N 0.13 0.18 0.15 0.18
Total.C 1.70 2.41 2.01 2.68
ExAl 0.14 0.12 0.72 1.58

These clusters are roughly equally sized. Robert used another method, partitioning around Mediods, to better account for outliers that were causing the algorithm to create a small additional cluster.

Give the clusters brief descriptions

d$km4 <- factor(d$km4, labels=c("pH=5.9, C=1.7", 
                                "pH=6.1,C=2.4", 
                                "pH=5.1, C=2",
                                "pH=4.8, C=2.6"))

Mappaing the location of the clusters

suppressMessages(library(ggmap))
qmplot(lon, lat, data=d, color=d$km4)
## Using zoom = 9...
## Map from URL : http://tile.stamen.com/toner-lite/9/297/258.png
## Map from URL : http://tile.stamen.com/toner-lite/9/298/258.png
## Map from URL : http://tile.stamen.com/toner-lite/9/299/258.png
## Map from URL : http://tile.stamen.com/toner-lite/9/297/259.png
## Map from URL : http://tile.stamen.com/toner-lite/9/298/259.png
## Map from URL : http://tile.stamen.com/toner-lite/9/299/259.png
## Map from URL : http://tile.stamen.com/toner-lite/9/297/260.png
## Map from URL : http://tile.stamen.com/toner-lite/9/298/260.png
## Map from URL : http://tile.stamen.com/toner-lite/9/299/260.png
## Warning: Removed 84 rows containing missing values (geom_point).

#qmplot(lon, lat, data=d, color=as.factor(hc4))
crdref <- CRS('+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0')
e <- d[!is.na(d$lon),]
ss <- SpatialPointsDataFrame(coords = e[, c("lon", "lat")], data=e, proj4string = crdref)

pal <- colorFactor("RdYlBu", domain=unique(ss$km4))

leaflet() %>% addTiles() %>%
  setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
  addCircleMarkers(lng=ss$lon, lat=ss$lat, 
                   #radius= ifelse(ss$km4==1, 10,6),
                   color = pal(ss$km4), clusterOptions = markerClusterOptions(disableClusteringAtZoom=11, spiderfyOnMaxZoom=FALSE)) %>% 
  addLegend(pal = pal, values=unique(ss$km4), title = "Clusters ")

Soil clusters by altitude and AEZ

ggplot(d, aes(x=km4, y=alt)) + geom_boxplot()
## Warning: Removed 84 rows containing non-finite values (stat_boxplot).

Soil clusters by altitude and rainfall

Try this when we have aWhere data

Compare clusters to AEZ

print(kable(table(d$km4, d$aez)))
Central Plateau Congo Nile East Lake Kivu
pH=5.9, C=1.7 299 63 226 235
pH=6.1,C=2.4 70 9 294 98
pH=5.1, C=2 189 200 86 198
pH=4.8, C=2.6 38 327 10 101

We’re not seeing close alignment between clusters and AEZ. This is probably because AEZ are a mix of altitude, rainfall, temperature and soil. When we have weather data we can try to cluster again and see if we’re more closely aligned.

Alternatively, this could be initial evidence that the AEZ designations are not capturing and separating variation very well. I think we need to start with more data though.

Mapping

Soil health maps for soil health study areas

#crdref <- CRS('+proj=longlat +datum=WGS84')
crdref <- CRS('+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0')
e <- d[!is.na(d$lon),]
ss <- SpatialPointsDataFrame(coords = e[, c("lon", "lat")], data=e, proj4string = crdref)

rw <- getData("GADM", country='RW', level=3, path = "/Users/mlowes/drive/optimized_agronomy/soil/soil_health_study/data") # the higher the number, the higher the resolution

#ext.rw.ss matches points with spatial polygons in rw
ext.rw.ss <- extract(rw[, "NAME_3"], ss)
## Loading required namespace: rgeos
ss$spatialname <- ext.rw.ss$NAME_3

frw <- fortify(rw, region="NAME_3")
ss.soil <- aggregate(ss@data[,soilVars], by=list(ss@data$spatialname), function(x){
  mean(x, na.rm=T)
})

plotReady <- dplyr::left_join(frw, ss.soil, by=c("id"="Group.1"))

Plot simple summary of soil characteristics

Consider revising these maps to a smaller greographic unit. Add the name of the location for uninitiated users.

library(RColorBrewer)
mapList <- list()
for(i in 1:length(soilVars)){
mapRes <-  ggplot(plotReady, aes(x=long, y=lat, group=group)) + geom_path() + 
  geom_polygon(aes(fill=plotReady[,soilVars[i]])) + 
  scale_fill_gradientn(colours = rev(brewer.pal(9,"Reds")), # define colors
                       name = soilVars[i],
                       guide = guide_colorbar(legend.direction = "vertical")) + 
  theme_bw() + 
  labs(title=paste("Rwanda long term soil health baseline - 2016", soilVars[i], sep= " "), x = "Longitude", y="Latitude")
mapList[[i]] <- mapRes
print(mapRes)
} 

# This is a small experiment to combine raster (spdf) and leaflet and be able to access the data in the raster interactively.
#mapLayer <- sp::merge(rw, ss.soil, by.x="NAME_3", by.y="Group.1")

# fill = T, fillOpacity = 0.7, fillColor = d.fill, 
#         stroke = T, color = "white", weight = 2, dashArray = 3, 
#         opacity = 0.5, popup = county.tt(d)

# leaflet(mapLayer) %>% addTiles() %>%
#   setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
#   addPolygons(
#     stroke = TRUE, opacity=0.2, smoothFactor = 0.5, 
#     fillColor=mapLayer$pH, fillOpacity = 0.5)
pdf(file=paste(md, "rw_shs_baseline_soil_maps.pdf", sep = "/"), width=11, height=8.5)
for(i in 1:length(mapList)){
print(mapList[[i]])
}  
dev.off()
## quartz_off_screen 
##                 2

Interpolations of soil health values

Interpolate soil health values for full operating area using soil health study values. We want to eventually add all Rwandan soil values into a single dataset to update and hone these values. See here for more guidanace

note:

  • Layer on the Tubura sites to the interpolated map.
  • Use leaflet so you can zoom in and out more easily.
  • Make it a raster layer that you can click on understand the values at different locations and the name of the location. I’ll need to make this a shiny app to have that functionality

The code below will run 5 K-fold cross validation to compare interpolation models. The output will be fed into the interpolate leaflet code below.

Check that I’m handling the projection correctly with Robert

# proj4string(ss) <- CRS("+init=epsg:4326")
# ss <- spTransform(ss, CRS=("+proj=utm +zone=36N +datum=WGS84"))
# root mean sq error for evaluating models
RMSE <- function(observed, predicted) {
  sqrt(mean((predicted - observed)^2, na.rm=TRUE))
}

# set k folds to 5
set.seed(20161030)
nfolds <- 5
k <- kfold(ss, nfolds) # from dismo

# cross validation of models
ensrmse <- tpsrmse <- idwrmse <- rep(NA, 5) # assing multiple objects at once

cv <- function(x) {
  
  for(i in 1:nfolds) {
    train <- ss[k!=i,]
    test <- ss[k==i,]

  train <- train[!is.na(train@data[,x]),]
  
  # inverse distance weights
  m <- gstat(formula=as.formula(paste(x, '~ 1')), locations=train)
  p1 <- predict(m, newdata=test, debug.level=0)$var1.pred
  idwrmse[i] <-  RMSE(test@data[,x], p1) #idw rsme
  
  # krieging
  # m <- autoKrige(formula=as.formula(paste(x, "~ 1")), input_data = train)
  # p2 <- predict(m, newdata=test, debug.level=0)$var1.pred
  # krigrmse[i] <-  RMSE(test$OZDLYAV, p2)
  
  # thin plate spline
  m <- Tps(coordinates(train), train@data[,x]) 
  p3 <- predict(m, coordinates(test))
  tpsrmse[i] <-  RMSE(test@data[,x], p3)
  
  w <- c(idwrmse[i], tpsrmse[i]) # combine the rmse
  weights <- w / sum(w) # weight them
  ensemble <- p1 * weights[1] + p3 * weights[2] 
  # multiply predictions by weights
  ensrmse[i] <-  RMSE(test@data[,x], ensemble) # truly an ensemble result?
  }
  
  output <- rbind(idwrmse, tpsrmse, ensrmse)
  return(output)
  
}

Now loop over the variables of interest where x is the soilVar variable.

output <- lapply(soilVars, function(x){
  ini <- data.frame(cv(x))
  ini$ave <- apply(ini[,1:5], 1, function(y){mean(y, na.rm=T)})
  res <- paste("Best model is ", row.names(ini[which.min(ini$ave),]),  sep = "")
  return(list(ini, res))
})

Interpolated soil maps

r <- raster(res=1/12)
r <- crop(r, floor(extent(rw)))

maps <- lapply(soilVars, function(x){
  m <- Tps(coordinates(ss), ss@data[,x])
  # make raster layer with model, raster is rwanda empty raster, model is m
  tps <- interpolate(r, m)
  tps <- crop(tps, rw)
  tps <- raster::mask(tps, rw) # cuts the tps raster down to the rw boundaries
  x <- gsub("m3.", "", x)
  
  palColors <- leaflet::colorNumeric(palette = "Reds", values(tps), na.color = "transparent")
  
  suppressWarnings(
  leaflet() %>% addTiles() %>%
  addRasterImage(tps, colors=palColors, opacity = 0.8) %>%
  setView(lng=rwanda$longitude, lat=rwanda$latitude, zoom=8) %>%
  addLegend(pal = palColors, values = values(tps), title = paste("Soil Value ", x, sep=""))
  )
})

save(maps,ss, file=paste(dd, "rw_baseline_interpolation_maps.Rdata", sep = "/"))

tagList(maps)

Print out the interpolated values for inclusion in the report.

pdf(file=paste(md, "rw_shs_bl_interpolation_soil_vars.pdf", sep = '/'), width=11, height=8.5)
lapply(soilVars, function(x) {
  m <- Tps(coordinates(ss), ss@data[,x])
  # make raster layer with model, raster is rwanda empty raster, model is m
  tps <- interpolate(r, m)
  tps <- crop(tps, rw)
  tps <- raster::mask(tps, rw) # cuts the tps raster down to the rw boundaries
  x <- gsub("m3.", "", x)
  
suppressMessages(
  outPlot <- invisible(print(
  plot(tps, main= paste("Soil TPS Interpolation ", x, sep="- ")),
  plot(rw, add=T, na.print=NULL)
  ))
)
})
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
dev.off()
## quartz_off_screen 
##                 2

–end